Age Owner Branch data TLA Line data Source code
1 : : /*
2 : : * brin.c
3 : : * Implementation of BRIN indexes for Postgres
4 : : *
5 : : * See src/backend/access/brin/README for details.
6 : : *
7 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/brin/brin.c
12 : : *
13 : : * TODO
14 : : * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)
15 : : */
16 : : #include "postgres.h"
17 : :
18 : : #include "access/brin.h"
19 : : #include "access/brin_page.h"
20 : : #include "access/brin_pageops.h"
21 : : #include "access/brin_xlog.h"
22 : : #include "access/relation.h"
23 : : #include "access/reloptions.h"
24 : : #include "access/relscan.h"
25 : : #include "access/table.h"
26 : : #include "access/tableam.h"
27 : : #include "access/xloginsert.h"
28 : : #include "catalog/index.h"
29 : : #include "catalog/pg_am.h"
30 : : #include "commands/vacuum.h"
31 : : #include "miscadmin.h"
32 : : #include "pgstat.h"
33 : : #include "postmaster/autovacuum.h"
34 : : #include "storage/bufmgr.h"
35 : : #include "storage/freespace.h"
36 : : #include "storage/proc.h"
37 : : #include "tcop/tcopprot.h"
38 : : #include "utils/acl.h"
39 : : #include "utils/datum.h"
40 : : #include "utils/fmgrprotos.h"
41 : : #include "utils/guc.h"
42 : : #include "utils/index_selfuncs.h"
43 : : #include "utils/memutils.h"
44 : : #include "utils/rel.h"
45 : : #include "utils/tuplesort.h"
46 : : #include "utils/wait_event.h"
47 : :
48 : : /* Magic numbers for parallel state sharing */
49 : : #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)
50 : : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)
51 : : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)
52 : : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)
53 : : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)
54 : :
55 : : /*
56 : : * Status for index builds performed in parallel. This is allocated in a
57 : : * dynamic shared memory segment.
58 : : */
59 : : typedef struct BrinShared
60 : : {
61 : : /*
62 : : * These fields are not modified during the build. They primarily exist
63 : : * for the benefit of worker processes that need to create state
64 : : * corresponding to that used by the leader.
65 : : */
66 : : Oid heaprelid;
67 : : Oid indexrelid;
68 : : bool isconcurrent;
69 : : BlockNumber pagesPerRange;
70 : : int scantuplesortstates;
71 : :
72 : : /* Query ID, for report in worker processes */
73 : : int64 queryid;
74 : :
75 : : /*
76 : : * workersdonecv is used to monitor the progress of workers. All parallel
77 : : * participants must indicate that they are done before leader can use
78 : : * results built by the workers (and before leader can write the data into
79 : : * the index).
80 : : */
81 : : ConditionVariable workersdonecv;
82 : :
83 : : /*
84 : : * mutex protects all fields before heapdesc.
85 : : *
86 : : * These fields contain status information of interest to BRIN index
87 : : * builds that must work just the same when an index is built in parallel.
88 : : */
89 : : slock_t mutex;
90 : :
91 : : /*
92 : : * Mutable state that is maintained by workers, and reported back to
93 : : * leader at end of the scans.
94 : : *
95 : : * nparticipantsdone is number of worker processes finished.
96 : : *
97 : : * reltuples is the total number of input heap tuples.
98 : : *
99 : : * indtuples is the total number of tuples that made it into the index.
100 : : */
101 : : int nparticipantsdone;
102 : : double reltuples;
103 : : double indtuples;
104 : :
105 : : /*
106 : : * ParallelTableScanDescData data follows. Can't directly embed here, as
107 : : * implementations of the parallel table scan desc interface might need
108 : : * stronger alignment.
109 : : */
110 : : } BrinShared;
111 : :
112 : : /*
113 : : * Return pointer to a BrinShared's parallel table scan.
114 : : *
115 : : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
116 : : * MAXALIGN.
117 : : */
118 : : #define ParallelTableScanFromBrinShared(shared) \
119 : : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))
120 : :
121 : : /*
122 : : * Status for leader in parallel index build.
123 : : */
124 : : typedef struct BrinLeader
125 : : {
126 : : /* parallel context itself */
127 : : ParallelContext *pcxt;
128 : :
129 : : /*
130 : : * nparticipanttuplesorts is the exact number of worker processes
131 : : * successfully launched, plus one leader process if it participates as a
132 : : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
133 : : * participating as a worker).
134 : : */
135 : : int nparticipanttuplesorts;
136 : :
137 : : /*
138 : : * Leader process convenience pointers to shared state (leader avoids TOC
139 : : * lookups).
140 : : *
141 : : * brinshared is the shared state for entire build. sharedsort is the
142 : : * shared, tuplesort-managed state passed to each process tuplesort.
143 : : * snapshot is the snapshot used by the scan iff an MVCC snapshot is
144 : : * required.
145 : : */
146 : : BrinShared *brinshared;
147 : : Sharedsort *sharedsort;
148 : : Snapshot snapshot;
149 : : WalUsage *walusage;
150 : : BufferUsage *bufferusage;
151 : : } BrinLeader;
152 : :
153 : : /*
154 : : * We use a BrinBuildState during initial construction of a BRIN index.
155 : : * The running state is kept in a BrinMemTuple.
156 : : */
157 : : typedef struct BrinBuildState
158 : : {
159 : : Relation bs_irel;
160 : : double bs_numtuples;
161 : : double bs_reltuples;
162 : : Buffer bs_currentInsertBuf;
163 : : BlockNumber bs_pagesPerRange;
164 : : BlockNumber bs_currRangeStart;
165 : : BlockNumber bs_maxRangeStart;
166 : : BrinRevmap *bs_rmAccess;
167 : : BrinDesc *bs_bdesc;
168 : : BrinMemTuple *bs_dtuple;
169 : :
170 : : BrinTuple *bs_emptyTuple;
171 : : Size bs_emptyTupleLen;
172 : : MemoryContext bs_context;
173 : :
174 : : /*
175 : : * bs_leader is only present when a parallel index build is performed, and
176 : : * only in the leader process. (Actually, only the leader process has a
177 : : * BrinBuildState.)
178 : : */
179 : : BrinLeader *bs_leader;
180 : : int bs_worker_id;
181 : :
182 : : /*
183 : : * The sortstate is used by workers (including the leader). It has to be
184 : : * part of the build state, because that's the only thing passed to the
185 : : * build callback etc.
186 : : */
187 : : Tuplesortstate *bs_sortstate;
188 : : } BrinBuildState;
189 : :
190 : : /*
191 : : * We use a BrinInsertState to capture running state spanning multiple
192 : : * brininsert invocations, within the same command.
193 : : */
194 : : typedef struct BrinInsertState
195 : : {
196 : : BrinRevmap *bis_rmAccess;
197 : : BrinDesc *bis_desc;
198 : : BlockNumber bis_pages_per_range;
199 : : } BrinInsertState;
200 : :
201 : : /*
202 : : * Struct used as "opaque" during index scans
203 : : */
204 : : typedef struct BrinOpaque
205 : : {
206 : : BlockNumber bo_pagesPerRange;
207 : : BrinRevmap *bo_rmAccess;
208 : : BrinDesc *bo_bdesc;
209 : : } BrinOpaque;
210 : :
211 : : #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber
212 : :
213 : : static BrinBuildState *initialize_brin_buildstate(Relation idxRel,
214 : : BrinRevmap *revmap,
215 : : BlockNumber pagesPerRange,
216 : : BlockNumber tablePages);
217 : : static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);
218 : : static void terminate_brin_buildstate(BrinBuildState *state);
219 : : static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
220 : : bool include_partial, double *numSummarized, double *numExisting);
221 : : static void form_and_insert_tuple(BrinBuildState *state);
222 : : static void form_and_spill_tuple(BrinBuildState *state);
223 : : static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,
224 : : BrinTuple *b);
225 : : static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);
226 : : static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,
227 : : BrinMemTuple *dtup, const Datum *values, const bool *nulls);
228 : : static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);
229 : : static void brin_fill_empty_ranges(BrinBuildState *state,
230 : : BlockNumber prevRange, BlockNumber nextRange);
231 : :
232 : : /* parallel index builds */
233 : : static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
234 : : bool isconcurrent, int request);
235 : : static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
236 : : static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
237 : : static double _brin_parallel_heapscan(BrinBuildState *state);
238 : : static double _brin_parallel_merge(BrinBuildState *state);
239 : : static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
240 : : Relation heap, Relation index);
241 : : static void _brin_parallel_scan_and_build(BrinBuildState *state,
242 : : BrinShared *brinshared,
243 : : Sharedsort *sharedsort,
244 : : Relation heap, Relation index,
245 : : int sortmem, bool progress);
246 : :
/*
 * BRIN handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
brinhandler(PG_FUNCTION_ARGS)
{
	/*
	 * The routine table is static const, so returning its address below is
	 * safe; it lives for the whole life of the backend.
	 */
	static const IndexAmRoutine amroutine = {
		.type = T_IndexAmRoutine,
		.amstrategies = 0,
		.amsupport = BRIN_LAST_OPTIONAL_PROCNUM,
		.amoptsprocnum = BRIN_PROCNUM_OPTIONS,
		.amcanorder = false,
		.amcanorderbyop = false,
		.amcanhash = false,
		.amconsistentequality = false,
		.amconsistentordering = false,
		.amcanbackward = false,
		.amcanunique = false,
		.amcanmulticol = true,
		.amoptionalkey = true,
		.amsearcharray = false,
		.amsearchnulls = true,
		.amstorage = true,
		.amclusterable = false,
		.ampredlocks = false,
		.amcanparallel = false,
		.amcanbuildparallel = true,
		.amcaninclude = false,
		.amusemaintenanceworkmem = false,
		.amsummarizing = true,
		.amparallelvacuumoptions =
		VACUUM_OPTION_PARALLEL_CLEANUP,
		.amkeytype = InvalidOid,

		.ambuild = brinbuild,
		.ambuildempty = brinbuildempty,
		.aminsert = brininsert,
		.aminsertcleanup = brininsertcleanup,
		.ambulkdelete = brinbulkdelete,
		.amvacuumcleanup = brinvacuumcleanup,
		.amcanreturn = NULL,
		.amcostestimate = brincostestimate,
		.amgettreeheight = NULL,
		.amoptions = brinoptions,
		.amproperty = NULL,
		.ambuildphasename = NULL,
		.amvalidate = brinvalidate,
		.amadjustmembers = NULL,
		.ambeginscan = brinbeginscan,
		.amrescan = brinrescan,
		/* amgettuple is NULL: BRIN supports only bitmap scans */
		.amgettuple = NULL,
		.amgetbitmap = bringetbitmap,
		.amendscan = brinendscan,
		.ammarkpos = NULL,
		.amrestrpos = NULL,
		.amestimateparallelscan = NULL,
		.aminitparallelscan = NULL,
		.amparallelrescan = NULL,
		.amtranslatestrategy = NULL,
		.amtranslatecmptype = NULL,
	};

	PG_RETURN_POINTER(&amroutine);
}
312 : :
313 : : /*
314 : : * Initialize a BrinInsertState to maintain state to be used across multiple
315 : : * tuple inserts, within the same command.
316 : : */
317 : : static BrinInsertState *
841 tomas.vondra@postgre 318 :CBC 561 : initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)
319 : : {
320 : : BrinInsertState *bistate;
321 : : MemoryContext oldcxt;
322 : :
323 : 561 : oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);
95 michael@paquier.xyz 324 :GNC 561 : bistate = palloc0_object(BrinInsertState);
841 tomas.vondra@postgre 325 :CBC 561 : bistate->bis_desc = brin_build_desc(idxRel);
326 : 561 : bistate->bis_rmAccess = brinRevmapInitialize(idxRel,
327 : : &bistate->bis_pages_per_range);
328 : 561 : indexInfo->ii_AmCache = bistate;
329 : 561 : MemoryContextSwitchTo(oldcxt);
330 : :
331 : 561 : return bistate;
332 : : }
333 : :
/*
 * A tuple in the heap is being inserted.  To keep a brin index up to date,
 * we need to obtain the relevant index tuple and compare its stored values
 * with those of the new tuple.  If the tuple values are not consistent with
 * the summary tuple, we need to update the index tuple.
 *
 * If autosummarization is enabled, check if we need to summarize the previous
 * page range.
 *
 * If the range is not currently summarized (i.e. the revmap returns NULL for
 * it), there's nothing to do for this tuple.
 *
 * Always returns false: the result of aminsert is only meaningful for
 * unique-capable AMs, and BRIN does not support uniqueness.
 */
bool
brininsert(Relation idxRel, Datum *values, bool *nulls,
		   ItemPointer heaptid, Relation heapRel,
		   IndexUniqueCheck checkUnique,
		   bool indexUnchanged,
		   IndexInfo *indexInfo)
{
	BlockNumber pagesPerRange;
	BlockNumber origHeapBlk;
	BlockNumber heapBlk;
	BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
	BrinRevmap *revmap;
	BrinDesc   *bdesc;
	Buffer		buf = InvalidBuffer;
	/* tupcxt is created lazily, only if we actually find a summary tuple */
	MemoryContext tupcxt = NULL;
	MemoryContext oldcxt = CurrentMemoryContext;
	bool		autosummarize = BrinGetAutoSummarize(idxRel);

	/*
	 * If first time through in this statement, initialize the insert state
	 * that we keep for all the inserts in the command.
	 */
	if (!bistate)
		bistate = initialize_brin_insertstate(idxRel, indexInfo);

	revmap = bistate->bis_rmAccess;
	bdesc = bistate->bis_desc;
	pagesPerRange = bistate->bis_pages_per_range;

	/*
	 * origHeapBlk is the block number where the insertion occurred.  heapBlk
	 * is the first block in the corresponding page range.
	 */
	origHeapBlk = ItemPointerGetBlockNumber(heaptid);
	heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;

	/*
	 * Retry loop: we normally exit via one of the break statements below,
	 * but if brin_doupdate fails because of a concurrent change we loop
	 * back and redo the whole lookup.
	 */
	for (;;)
	{
		bool		need_insert = false;
		OffsetNumber off;
		BrinTuple  *brtup;
		BrinMemTuple *dtup;

		CHECK_FOR_INTERRUPTS();

		/*
		 * If auto-summarization is enabled and we just inserted the first
		 * tuple into the first block of a new non-first page range, request a
		 * summarization run of the previous range.
		 */
		if (autosummarize &&
			heapBlk > 0 &&
			heapBlk == origHeapBlk &&
			ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)
		{
			BlockNumber lastPageRange = heapBlk - 1;
			BrinTuple  *lastPageTuple;

			lastPageTuple =
				brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,
										 NULL, BUFFER_LOCK_SHARE);
			if (!lastPageTuple)
			{
				/* previous range is unsummarized; ask autovacuum to do it */
				bool		recorded;

				recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,
												 RelationGetRelid(idxRel),
												 lastPageRange);
				if (!recorded)
					ereport(LOG,
							(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
							 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",
									RelationGetRelationName(idxRel),
									lastPageRange)));
			}
			else
				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}

		brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,
										 NULL, BUFFER_LOCK_SHARE);

		/* if range is unsummarized, there's nothing to do */
		if (!brtup)
			break;

		/* First time through in this brininsert call? */
		if (tupcxt == NULL)
		{
			tupcxt = AllocSetContextCreate(CurrentMemoryContext,
										   "brininsert cxt",
										   ALLOCSET_DEFAULT_SIZES);
			MemoryContextSwitchTo(tupcxt);
		}

		dtup = brin_deform_tuple(bdesc, brtup, NULL);

		need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);

		if (!need_insert)
		{
			/*
			 * The tuple is consistent with the new values, so there's nothing
			 * to do.
			 */
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
		else
		{
			Page		page = BufferGetPage(buf);
			ItemId		lp = PageGetItemId(page, off);
			Size		origsz;
			BrinTuple  *origtup;
			Size		newsz;
			BrinTuple  *newtup;
			bool		samepage;

			/*
			 * Make a copy of the old tuple, so that we can compare it after
			 * re-acquiring the lock.
			 */
			origsz = ItemIdGetLength(lp);
			origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);

			/*
			 * Before releasing the lock, check if we can attempt a same-page
			 * update.  Another process could insert a tuple concurrently in
			 * the same page though, so downstream we must be prepared to cope
			 * if this turns out to not be possible after all.
			 */
			newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);
			samepage = brin_can_do_samepage_update(buf, origsz, newsz);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * Try to update the tuple.  If this doesn't work for whatever
			 * reason, we need to restart from the top; the revmap might be
			 * pointing at a different tuple for this block now, so we need to
			 * recompute to ensure both our new heap tuple and the other
			 * inserter's are covered by the combined tuple.  It might be that
			 * we don't need to update at all.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,
							   buf, off, origtup, origsz, newtup, newsz,
							   samepage))
			{
				/* no luck; start over */
				MemoryContextReset(tupcxt);
				continue;
			}
		}

		/* success! */
		break;
	}

	/* buf may still be pinned from brinGetTupleForHeapBlock; release it */
	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
	MemoryContextSwitchTo(oldcxt);
	if (tupcxt != NULL)
		MemoryContextDelete(tupcxt);

	return false;
}
510 : :
511 : : /*
512 : : * Callback to clean up the BrinInsertState once all tuple inserts are done.
513 : : */
514 : : void
695 tomas.vondra@postgre 515 : 578 : brininsertcleanup(Relation index, IndexInfo *indexInfo)
516 : : {
841 517 : 578 : BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;
518 : :
519 : : /* bail out if cache not initialized */
389 tgl@sss.pgh.pa.us 520 [ + + ]: 578 : if (bistate == NULL)
695 tomas.vondra@postgre 521 : 17 : return;
522 : :
523 : : /* do this first to avoid dangling pointer if we fail partway through */
389 tgl@sss.pgh.pa.us 524 : 561 : indexInfo->ii_AmCache = NULL;
525 : :
526 : : /*
527 : : * Clean up the revmap. Note that the brinDesc has already been cleaned up
528 : : * as part of its own memory context.
529 : : */
841 tomas.vondra@postgre 530 : 561 : brinRevmapTerminate(bistate->bis_rmAccess);
389 tgl@sss.pgh.pa.us 531 : 561 : pfree(bistate);
532 : : }
533 : :
534 : : /*
535 : : * Initialize state for a BRIN index scan.
536 : : *
537 : : * We read the metapage here to determine the pages-per-range number that this
538 : : * index was built with. Note that since this cannot be changed while we're
539 : : * holding lock on index, it's not necessary to recompute it during brinrescan.
540 : : */
541 : : IndexScanDesc
3710 542 : 1473 : brinbeginscan(Relation r, int nkeys, int norderbys)
543 : : {
544 : : IndexScanDesc scan;
545 : : BrinOpaque *opaque;
546 : :
4146 alvherre@alvh.no-ip. 547 : 1473 : scan = RelationGetIndexScan(r, nkeys, norderbys);
548 : :
1280 peter@eisentraut.org 549 : 1473 : opaque = palloc_object(BrinOpaque);
919 tmunro@postgresql.or 550 : 1473 : opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
4146 alvherre@alvh.no-ip. 551 : 1473 : opaque->bo_bdesc = brin_build_desc(r);
552 : 1473 : scan->opaque = opaque;
553 : :
3710 tgl@sss.pgh.pa.us 554 : 1473 : return scan;
555 : : }
556 : :
557 : : /*
558 : : * Execute the index scan.
559 : : *
560 : : * This works by reading index TIDs from the revmap, and obtaining the index
561 : : * tuples pointed to by them; the summary values in the index tuples are
562 : : * compared to the scan keys. We return into the TID bitmap all the pages in
563 : : * ranges corresponding to index tuples that match the scan keys.
564 : : *
565 : : * If a TID from the revmap is read as InvalidTID, we know that range is
566 : : * unsummarized. Pages in those ranges need to be returned regardless of scan
567 : : * keys.
568 : : */
569 : : int64
570 : 1473 : bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
571 : : {
4146 alvherre@alvh.no-ip. 572 : 1473 : Relation idxRel = scan->indexRelation;
573 : 1473 : Buffer buf = InvalidBuffer;
574 : : BrinDesc *bdesc;
575 : : Oid heapOid;
576 : : Relation heapRel;
577 : : BrinOpaque *opaque;
578 : : BlockNumber nblocks;
425 michael@paquier.xyz 579 : 1473 : int64 totalpages = 0;
580 : : FmgrInfo *consistentFn;
581 : : MemoryContext oldcxt;
582 : : MemoryContext perRangeCxt;
583 : : BrinMemTuple *dtup;
3224 bruce@momjian.us 584 : 1473 : BrinTuple *btup = NULL;
3264 alvherre@alvh.no-ip. 585 : 1473 : Size btupsz = 0;
586 : : ScanKey **keys,
587 : : **nullkeys;
588 : : int *nkeys,
589 : : *nnullkeys;
590 : : char *ptr;
591 : : Size len;
592 : : char *tmp PG_USED_FOR_ASSERTS_ONLY;
593 : :
4146 594 : 1473 : opaque = (BrinOpaque *) scan->opaque;
595 : 1473 : bdesc = opaque->bo_bdesc;
596 [ - + - - : 1473 : pgstat_count_index_scan(idxRel);
+ - ]
369 pg@bowt.ie 597 [ + - ]: 1473 : if (scan->instrument)
598 : 1473 : scan->instrument->nsearches++;
599 : :
600 : : /*
601 : : * We need to know the size of the table so that we know how long to
602 : : * iterate on the revmap.
603 : : */
4146 alvherre@alvh.no-ip. 604 : 1473 : heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);
2610 andres@anarazel.de 605 : 1473 : heapRel = table_open(heapOid, AccessShareLock);
4146 alvherre@alvh.no-ip. 606 : 1473 : nblocks = RelationGetNumberOfBlocks(heapRel);
2610 andres@anarazel.de 607 : 1473 : table_close(heapRel, AccessShareLock);
608 : :
609 : : /*
610 : : * Make room for the consistent support procedures of indexed columns. We
611 : : * don't look them up here; we do that lazily the first time we see a scan
612 : : * key reference each of them. We rely on zeroing fn_oid to InvalidOid.
613 : : */
1280 peter@eisentraut.org 614 : 1473 : consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);
615 : :
616 : : /*
617 : : * Make room for per-attribute lists of scan keys that we'll pass to the
618 : : * consistent support procedure. We don't know which attributes have scan
619 : : * keys, so we allocate space for all attributes. That may use more memory
620 : : * but it's probably cheaper than determining which attributes are used.
621 : : *
622 : : * We keep null and regular keys separate, so that we can pass just the
623 : : * regular keys to the consistent function easily.
624 : : *
625 : : * To reduce the allocation overhead, we allocate one big chunk and then
626 : : * carve it into smaller arrays ourselves. All the pieces have exactly the
627 : : * same lifetime, so that's OK.
628 : : *
629 : : * XXX The widest index can have 32 attributes, so the amount of wasted
630 : : * memory is negligible. We could invent a more compact approach (with
631 : : * just space for used attributes) but that would make the matching more
632 : : * complex so it's not a good trade-off.
633 : : */
1818 tomas.vondra@postgre 634 : 1473 : len =
635 : 1473 : MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */
636 : 1473 : MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
637 : 1473 : MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +
638 : 1473 : MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */
639 : 1473 : MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +
640 : 1473 : MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
641 : :
642 : 1473 : ptr = palloc(len);
643 : 1473 : tmp = ptr;
644 : :
645 : 1473 : keys = (ScanKey **) ptr;
646 : 1473 : ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
647 : :
648 : 1473 : nullkeys = (ScanKey **) ptr;
649 : 1473 : ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);
650 : :
651 : 1473 : nkeys = (int *) ptr;
652 : 1473 : ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
653 : :
654 : 1473 : nnullkeys = (int *) ptr;
655 : 1473 : ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);
656 : :
657 [ + + ]: 34989 : for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)
658 : : {
659 : 33516 : keys[i] = (ScanKey *) ptr;
660 : 33516 : ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
661 : :
662 : 33516 : nullkeys[i] = (ScanKey *) ptr;
663 : 33516 : ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);
664 : : }
665 : :
666 [ - + ]: 1473 : Assert(tmp + len == ptr);
667 : :
668 : : /* zero the number of keys */
669 : 1473 : memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
670 : 1473 : memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);
671 : :
672 : : /* Preprocess the scan keys - split them into per-attribute arrays. */
1299 drowley@postgresql.o 673 [ + + ]: 2946 : for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)
674 : : {
1818 tomas.vondra@postgre 675 : 1473 : ScanKey key = &scan->keyData[keyno];
676 : 1473 : AttrNumber keyattno = key->sk_attno;
677 : :
678 : : /*
679 : : * The collation of the scan key must match the collation used in the
680 : : * index column (but only if the search is not IS NULL/ IS NOT NULL).
681 : : * Otherwise we shouldn't be using this index ...
682 : : */
683 [ + + - + ]: 1473 : Assert((key->sk_flags & SK_ISNULL) ||
684 : : (key->sk_collation ==
685 : : TupleDescAttr(bdesc->bd_tupdesc,
686 : : keyattno - 1)->attcollation));
687 : :
688 : : /*
689 : : * First time we see this index attribute, so init as needed.
690 : : *
691 : : * This is a bit of an overkill - we don't know how many scan keys are
692 : : * there for this attribute, so we simply allocate the largest number
693 : : * possible (as if all keys were for this attribute). This may waste a
694 : : * bit of memory, but we only expect small number of scan keys in
695 : : * general, so this should be negligible, and repeated repalloc calls
696 : : * are not free either.
697 : : */
698 [ + - ]: 1473 : if (consistentFn[keyattno - 1].fn_oid == InvalidOid)
699 : : {
700 : : FmgrInfo *tmp;
701 : :
702 : : /* First time we see this attribute, so no key/null keys. */
703 [ - + ]: 1473 : Assert(nkeys[keyattno - 1] == 0);
704 [ - + ]: 1473 : Assert(nnullkeys[keyattno - 1] == 0);
705 : :
706 : 1473 : tmp = index_getprocinfo(idxRel, keyattno,
707 : : BRIN_PROCNUM_CONSISTENT);
708 : 1473 : fmgr_info_copy(&consistentFn[keyattno - 1], tmp,
709 : : CurrentMemoryContext);
710 : : }
711 : :
712 : : /* Add key to the proper per-attribute array. */
713 [ + + ]: 1473 : if (key->sk_flags & SK_ISNULL)
714 : : {
715 : 18 : nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;
716 : 18 : nnullkeys[keyattno - 1]++;
717 : : }
718 : : else
719 : : {
720 : 1455 : keys[keyattno - 1][nkeys[keyattno - 1]] = key;
721 : 1455 : nkeys[keyattno - 1]++;
722 : : }
723 : : }
724 : :
725 : : /* allocate an initial in-memory tuple, out of the per-range memcxt */
3264 alvherre@alvh.no-ip. 726 : 1473 : dtup = brin_new_memtuple(bdesc);
727 : :
728 : : /*
729 : : * Setup and use a per-range memory context, which is reset every time we
730 : : * loop below. This avoids having to free the tuples within the loop.
731 : : */
4146 732 : 1473 : perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,
733 : : "bringetbitmap cxt",
734 : : ALLOCSET_DEFAULT_SIZES);
735 : 1473 : oldcxt = MemoryContextSwitchTo(perRangeCxt);
736 : :
737 : : /*
738 : : * Now scan the revmap. We start by querying for heap page 0,
739 : : * incrementing by the number of pages per range; this gives us a full
740 : : * view of the table. We make use of uint64 for heapBlk as a BlockNumber
741 : : * could wrap for tables with close to 2^32 pages.
742 : : */
145 drowley@postgresql.o 743 [ + + ]: 97299 : for (uint64 heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)
744 : : {
745 : : bool addrange;
3264 alvherre@alvh.no-ip. 746 : 95826 : bool gottuple = false;
747 : : BrinTuple *tup;
748 : : OffsetNumber off;
749 : : Size size;
750 : :
4146 751 [ - + ]: 95826 : CHECK_FOR_INTERRUPTS();
752 : :
851 nathan@postgresql.or 753 : 95826 : MemoryContextReset(perRangeCxt);
754 : :
145 755 : 95826 : tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, (BlockNumber) heapBlk, &buf,
756 : : &off, &size, BUFFER_LOCK_SHARE);
4146 alvherre@alvh.no-ip. 757 [ + + ]: 95826 : if (tup)
758 : : {
3264 759 : 94968 : gottuple = true;
760 : 94968 : btup = brin_copy_tuple(tup, size, btup, &btupsz);
4146 761 : 94968 : LockBuffer(buf, BUFFER_LOCK_UNLOCK);
762 : : }
763 : :
764 : : /*
765 : : * For page ranges with no indexed tuple, we must return the whole
766 : : * range; otherwise, compare it to the scan keys.
767 : : */
3264 768 [ + + ]: 95826 : if (!gottuple)
769 : : {
4146 770 : 858 : addrange = true;
771 : : }
772 : : else
773 : : {
3264 774 : 94968 : dtup = brin_deform_tuple(bdesc, btup, dtup);
4146 775 [ - + ]: 94968 : if (dtup->bt_placeholder)
776 : : {
777 : : /*
778 : : * Placeholder tuples are always returned, regardless of the
779 : : * values stored in them.
780 : : */
4146 alvherre@alvh.no-ip. 781 :UBC 0 : addrange = true;
782 : : }
783 : : else
784 : : {
785 : : int attno;
786 : :
787 : : /*
788 : : * Compare scan keys with summary values stored for the range.
789 : : * If scan keys are matched, the page range must be added to
790 : : * the bitmap. We initially assume the range needs to be
791 : : * added; in particular this serves the case where there are
792 : : * no keys.
793 : : */
4146 alvherre@alvh.no-ip. 794 :CBC 94968 : addrange = true;
1818 tomas.vondra@postgre 795 [ + + ]: 2352033 : for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)
796 : : {
797 : : BrinValues *bval;
798 : : Datum add;
799 : : Oid collation;
800 : :
801 : : /*
802 : : * skip attributes without any scan keys (both regular and
803 : : * IS [NOT] NULL)
804 : : */
805 [ + + + + ]: 2283867 : if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)
806 : 2188899 : continue;
807 : :
808 : 94968 : bval = &dtup->bt_columns[attno - 1];
809 : :
810 : : /*
811 : : * If the BRIN tuple indicates that this range is empty,
812 : : * we can skip it: there's nothing to match. We don't
813 : : * need to examine the next columns.
814 : : */
1031 815 [ - + ]: 94968 : if (dtup->bt_empty_range)
816 : : {
1031 tomas.vondra@postgre 817 :UBC 0 : addrange = false;
818 : 0 : break;
819 : : }
820 : :
821 : : /*
822 : : * First check if there are any IS [NOT] NULL scan keys,
823 : : * and if we're violating them. In that case we can
824 : : * terminate early, without invoking the support function.
825 : : *
826 : : * As there may be more keys, we can only determine
827 : : * mismatch within this loop.
828 : : */
1818 tomas.vondra@postgre 829 [ + - ]:CBC 94968 : if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&
830 [ + + ]: 94968 : !check_null_keys(bval, nullkeys[attno - 1],
831 : 94968 : nnullkeys[attno - 1]))
832 : : {
833 : : /*
834 : : * If any of the IS [NOT] NULL keys failed, the page
835 : : * range as a whole can't pass. So terminate the loop.
836 : : */
837 : 498 : addrange = false;
838 : 498 : break;
839 : : }
840 : :
841 : : /*
842 : : * So either there are no IS [NOT] NULL keys, or all
843 : : * passed. If there are no regular scan keys, we're done -
844 : : * the page range matches. If there are regular keys, but
845 : : * the page range is marked as 'all nulls' it can't
846 : : * possibly pass (we're assuming the operators are
847 : : * strict).
848 : : */
849 : :
850 : : /* No regular scan keys - page range as a whole passes. */
851 [ + + ]: 94470 : if (!nkeys[attno - 1])
852 : 618 : continue;
853 : :
854 [ + - - + ]: 93852 : Assert((nkeys[attno - 1] > 0) &&
855 : : (nkeys[attno - 1] <= scan->numberOfKeys));
856 : :
857 : : /* If it is all nulls, it cannot possibly be consistent. */
858 [ + + ]: 93852 : if (bval->bv_allnulls)
859 : : {
860 : 189 : addrange = false;
861 : 189 : break;
862 : : }
863 : :
864 : : /*
865 : : * Collation from the first key (has to be the same for
866 : : * all keys for the same attribute).
867 : : */
1815 868 : 93663 : collation = keys[attno - 1][0]->sk_collation;
869 : :
870 : : /*
871 : : * Check whether the scan key is consistent with the page
872 : : * range values; if so, have the pages in the range added
873 : : * to the output bitmap.
874 : : *
875 : : * The opclass may or may not support processing of
876 : : * multiple scan keys. We can determine that based on the
877 : : * number of arguments - functions with extra parameter
878 : : * (number of scan keys) do support this, otherwise we
879 : : * have to simply pass the scan keys one by one.
880 : : */
881 [ + + ]: 93663 : if (consistentFn[attno - 1].fn_nargs >= 4)
882 : : {
883 : : /* Check all keys at once */
884 : 19797 : add = FunctionCall4Coll(&consistentFn[attno - 1],
885 : : collation,
886 : : PointerGetDatum(bdesc),
887 : : PointerGetDatum(bval),
888 : 19797 : PointerGetDatum(keys[attno - 1]),
889 : 19797 : Int32GetDatum(nkeys[attno - 1]));
890 : 19797 : addrange = DatumGetBool(add);
891 : : }
892 : : else
893 : : {
894 : : /*
895 : : * Check keys one by one
896 : : *
897 : : * When there are multiple scan keys, failure to meet
898 : : * the criteria for a single one of them is enough to
899 : : * discard the range as a whole, so break out of the
900 : : * loop as soon as a false return value is obtained.
901 : : */
902 : : int keyno;
903 : :
904 [ + + ]: 129039 : for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)
905 : : {
906 : 73866 : add = FunctionCall3Coll(&consistentFn[attno - 1],
907 : 73866 : keys[attno - 1][keyno]->sk_collation,
908 : : PointerGetDatum(bdesc),
909 : : PointerGetDatum(bval),
910 : 73866 : PointerGetDatum(keys[attno - 1][keyno]));
911 : 73866 : addrange = DatumGetBool(add);
912 [ + + ]: 73866 : if (!addrange)
913 : 18693 : break;
914 : : }
915 : : }
916 : :
917 : : /*
918 : : * If we found a scan key eliminating the range, no need
919 : : * to check additional ones.
920 : : */
1120 921 [ + + ]: 93663 : if (!addrange)
922 : 26115 : break;
923 : : }
924 : : }
925 : : }
926 : :
927 : : /* add the pages in the range to the output bitmap, if needed */
4146 alvherre@alvh.no-ip. 928 [ + + ]: 95826 : if (addrange)
929 : : {
930 : : uint64 pageno;
931 : :
932 : 69024 : for (pageno = heapBlk;
1803 tomas.vondra@postgre 933 [ + + ]: 143004 : pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;
4146 alvherre@alvh.no-ip. 934 : 73980 : pageno++)
935 : : {
936 : 73980 : MemoryContextSwitchTo(oldcxt);
937 : 73980 : tbm_add_page(tbm, pageno);
938 : 73980 : totalpages++;
939 : 73980 : MemoryContextSwitchTo(perRangeCxt);
940 : : }
941 : : }
942 : : }
943 : :
944 : 1473 : MemoryContextSwitchTo(oldcxt);
945 : 1473 : MemoryContextDelete(perRangeCxt);
946 : :
947 [ + - ]: 1473 : if (buf != InvalidBuffer)
948 : 1473 : ReleaseBuffer(buf);
949 : :
950 : : /*
951 : : * XXX We have an approximation of the number of *pages* that our scan
952 : : * returns, but we don't have a precise idea of the number of heap tuples
953 : : * involved.
954 : : */
3710 tgl@sss.pgh.pa.us 955 : 1473 : return totalpages * 10;
956 : : }
957 : :
958 : : /*
959 : : * Re-initialize state for a BRIN index scan
960 : : */
961 : : void
962 : 1473 : brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
963 : : ScanKey orderbys, int norderbys)
964 : : {
965 : : /*
966 : : * Other index AMs preprocess the scan keys at this point, or sometime
967 : : * early during the scan; this lets them optimize by removing redundant
968 : : * keys, or doing early returns when they are impossible to satisfy; see
969 : : * _bt_preprocess_keys for an example. Something like that could be added
970 : : * here someday, too.
971 : : */
972 : :
4146 alvherre@alvh.no-ip. 973 [ + - + - ]: 1473 : if (scankey && scan->numberOfKeys > 0)
550 peter@eisentraut.org 974 : 1473 : memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
4146 alvherre@alvh.no-ip. 975 : 1473 : }
976 : :
977 : : /*
978 : : * Close down a BRIN index scan
979 : : */
980 : : void
3710 tgl@sss.pgh.pa.us 981 : 1473 : brinendscan(IndexScanDesc scan)
982 : : {
4146 alvherre@alvh.no-ip. 983 : 1473 : BrinOpaque *opaque = (BrinOpaque *) scan->opaque;
984 : :
985 : 1473 : brinRevmapTerminate(opaque->bo_rmAccess);
986 : 1473 : brin_free_desc(opaque->bo_bdesc);
987 : 1473 : pfree(opaque);
988 : 1473 : }
989 : :
990 : : /*
991 : : * Per-heap-tuple callback for table_index_build_scan.
992 : : *
993 : : * Note we don't worry about the page range at the end of the table here; it is
994 : : * present in the build state struct after we're called the last time, but not
995 : : * inserted into the index. Caller must ensure to do so, if appropriate.
996 : : */
997 : : static void
998 : 464240 : brinbuildCallback(Relation index,
999 : : ItemPointer tid,
1000 : : Datum *values,
1001 : : bool *isnull,
1002 : : bool tupleIsAlive,
1003 : : void *brstate)
1004 : : {
1005 : 464240 : BrinBuildState *state = (BrinBuildState *) brstate;
1006 : : BlockNumber thisblock;
1007 : :
2319 andres@anarazel.de 1008 : 464240 : thisblock = ItemPointerGetBlockNumber(tid);
1009 : :
1010 : : /*
1011 : : * If we're in a block that belongs to a future range, summarize what
1012 : : * we've got and start afresh. Note the scan might have skipped many
1013 : : * pages, if they were devoid of live tuples; make sure to insert index
1014 : : * tuples for those too.
1015 : : */
4146 alvherre@alvh.no-ip. 1016 [ + + ]: 465392 : while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
1017 : : {
1018 : :
1019 : : BRIN_elog((DEBUG2,
1020 : : "brinbuildCallback: completed a range: %u--%u",
1021 : : state->bs_currRangeStart,
1022 : : state->bs_currRangeStart + state->bs_pagesPerRange));
1023 : :
1024 : : /* create the index tuple and insert it */
1025 : 1152 : form_and_insert_tuple(state);
1026 : :
1027 : : /* set state to correspond to the next range */
1028 : 1152 : state->bs_currRangeStart += state->bs_pagesPerRange;
1029 : :
1030 : : /* re-initialize state for it */
1031 : 1152 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1032 : : }
1033 : :
1034 : : /* Accumulate the current tuple into the running state */
1818 tomas.vondra@postgre 1035 : 464240 : (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1036 : : values, isnull);
4146 alvherre@alvh.no-ip. 1037 : 464240 : }
1038 : :
1039 : : /*
1040 : : * Per-heap-tuple callback for table_index_build_scan with parallelism.
1041 : : *
1042 : : * A version of the callback used by parallel index builds. The main difference
1043 : : * is that instead of writing the BRIN tuples into the index, we write them
1044 : : * into a shared tuplesort, and leave the insertion up to the leader (which may
1045 : : * reorder them a bit etc.). The callback also does not generate empty ranges,
1046 : : * those will be added by the leader when merging results from workers.
1047 : : */
1048 : : static void
828 tomas.vondra@postgre 1049 : 3981 : brinbuildCallbackParallel(Relation index,
1050 : : ItemPointer tid,
1051 : : Datum *values,
1052 : : bool *isnull,
1053 : : bool tupleIsAlive,
1054 : : void *brstate)
1055 : : {
1056 : 3981 : BrinBuildState *state = (BrinBuildState *) brstate;
1057 : : BlockNumber thisblock;
1058 : :
1059 : 3981 : thisblock = ItemPointerGetBlockNumber(tid);
1060 : :
1061 : : /*
1062 : : * If we're in a block that belongs to a different range, summarize what
1063 : : * we've got and start afresh. Note the scan might have skipped many
1064 : : * pages, if they were devoid of live tuples; we do not create empty BRIN
1065 : : * ranges here - the leader is responsible for filling them in.
1066 : : *
1067 : : * Unlike serial builds, parallel index builds allow synchronized seqscans
1068 : : * (because that's what parallel scans do). This means the block may wrap
1069 : : * around to the beginning of the relation, so the condition needs to
1070 : : * check for both future and past ranges.
1071 : : */
806 1072 [ + - ]: 3981 : if ((thisblock < state->bs_currRangeStart) ||
1073 [ + + ]: 3981 : (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))
1074 : : {
1075 : :
1076 : : BRIN_elog((DEBUG2,
1077 : : "brinbuildCallbackParallel: completed a range: %u--%u",
1078 : : state->bs_currRangeStart,
1079 : : state->bs_currRangeStart + state->bs_pagesPerRange));
1080 : :
1081 : : /* create the index tuple and write it into the tuplesort */
828 1082 : 38 : form_and_spill_tuple(state);
1083 : :
1084 : : /*
1085 : : * Set state to correspond to the next range (for this block).
1086 : : *
1087 : : * This skips ranges that are either empty (and so we don't get any
1088 : : * tuples to summarize), or processed by other workers. We can't
1089 : : * differentiate those cases here easily, so we leave it up to the
1090 : : * leader to fill empty ranges where needed.
1091 : : */
1092 : : state->bs_currRangeStart
1093 : 38 : = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
1094 : :
1095 : : /* re-initialize state for it */
1096 : 38 : brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
1097 : : }
1098 : :
1099 : : /* Accumulate the current tuple into the running state */
1100 : 3981 : (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,
1101 : : values, isnull);
1102 : 3981 : }
1103 : :
/*
 * brinbuild() -- build a new BRIN index.
 *
 * Writes the metapage (WAL-logging it if needed), then scans the heap either
 * serially or with parallel workers, summarizing each page range into an
 * index tuple.  Returns heap/index tuple counts for the caller's statistics.
 */
IndexBuildResult *
brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	double		idxtuples;
	BrinRevmap *revmap;
	BrinBuildState *state;
	Buffer		meta;
	BlockNumber pagesPerRange;

	/*
	 * We expect to be called exactly once for any index relation.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/*
	 * Critical section not required, because on error the creation of the
	 * whole relation will be rolled back.
	 */

	/* Allocate and lock block 0, which becomes the BRIN metapage. */
	meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
							 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);
	Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);

	brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),
					   BRIN_CURRENT_VERSION);
	MarkBufferDirty(meta);

	/* WAL-log the metapage initialization, if this index is logged. */
	if (RelationNeedsWAL(index))
	{
		xl_brin_createidx xlrec;
		XLogRecPtr	recptr;
		Page		page;

		xlrec.version = BRIN_CURRENT_VERSION;
		xlrec.pagesPerRange = BrinGetPagesPerRange(index);

		XLogBeginInsert();
		XLogRegisterData(&xlrec, SizeOfBrinCreateIdx);
		XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);

		recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);

		page = BufferGetPage(meta);
		PageSetLSN(page, recptr);
	}

	UnlockReleaseBuffer(meta);

	/*
	 * Initialize our state, including the deformed tuple state.
	 */
	revmap = brinRevmapInitialize(index, &pagesPerRange);
	state = initialize_brin_buildstate(index, revmap, pagesPerRange,
									   RelationGetNumberOfBlocks(heap));

	/*
	 * Attempt to launch parallel worker scan when required
	 *
	 * XXX plan_create_index_workers makes the number of workers dependent on
	 * maintenance_work_mem, requiring 32MB for each worker.  That makes
	 * sense for btree, but not for BRIN, which can do with much less memory.
	 * So maybe make that somehow less strict, optionally?
	 */
	if (indexInfo->ii_ParallelWorkers > 0)
		_brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,
							 indexInfo->ii_ParallelWorkers);

	/*
	 * If parallel build requested and at least one worker process was
	 * successfully launched, set up coordination state, wait for workers to
	 * complete.  Then read all tuples from the shared tuplesort and insert
	 * them into the index.
	 *
	 * In serial mode, simply scan the table and build the index one index
	 * tuple at a time.
	 */
	if (state->bs_leader)
	{
		SortCoordinate coordinate;

		coordinate = palloc0_object(SortCoordinateData);
		coordinate->isWorker = false;
		coordinate->nParticipants =
			state->bs_leader->nparticipanttuplesorts;
		coordinate->sharedsort = state->bs_leader->sharedsort;

		/*
		 * Begin leader tuplesort.
		 *
		 * In cases where parallelism is involved, the leader receives the
		 * same share of maintenance_work_mem as a serial sort (it is
		 * generally treated in the same way as a serial sort once we
		 * return).  Parallel worker Tuplesortstates will have received only
		 * a fraction of maintenance_work_mem, though.
		 *
		 * We rely on the lifetime of the Leader Tuplesortstate almost not
		 * overlapping with any worker Tuplesortstate's lifetime.  There may
		 * be some small overlap, but that's okay because we rely on leader
		 * Tuplesortstate only allocating a small, fixed amount of memory
		 * here.  When its tuplesort_performsort() is called (by our caller),
		 * and significant amounts of memory are likely to be used, all
		 * workers must have already freed almost all memory held by their
		 * Tuplesortstates (they are about to go away completely, too).  The
		 * overall effect is that maintenance_work_mem always represents an
		 * absolute high watermark on the amount of memory used by a CREATE
		 * INDEX operation, regardless of the use of parallelism or any
		 * other factor.
		 */
		state->bs_sortstate =
			tuplesort_begin_index_brin(maintenance_work_mem, coordinate,
									   TUPLESORT_NONE);

		/* scan the relation and merge per-worker results */
		reltuples = _brin_parallel_merge(state);

		_brin_end_parallel(state->bs_leader, state);
	}
	else						/* no parallel index build */
	{
		/*
		 * Now scan the relation.  No syncscan allowed here because we want
		 * the heap blocks in physical order (we want to produce the ranges
		 * starting from block 0, and the callback also relies on this to
		 * not generate summary for the same range twice).
		 */
		reltuples = table_index_build_scan(heap, index, indexInfo, false, true,
										   brinbuildCallback, state, NULL);

		/*
		 * process the final batch
		 *
		 * XXX Note this does not update state->bs_currRangeStart, i.e. it
		 * stays set to the last range added to the index.  This is OK,
		 * because that's what brin_fill_empty_ranges expects.
		 */
		form_and_insert_tuple(state);

		/*
		 * Backfill the final ranges with empty data.
		 *
		 * This saves us from doing what amounts to full table scans when
		 * the index with a predicate like WHERE (nonnull_column IS NULL),
		 * or other very selective predicates.
		 */
		brin_fill_empty_ranges(state,
							   state->bs_currRangeStart,
							   state->bs_maxRangeStart);
	}

	/* release resources */
	idxtuples = state->bs_numtuples;
	brinRevmapTerminate(state->bs_rmAccess);
	terminate_brin_buildstate(state);

	/*
	 * Return statistics
	 */
	result = palloc_object(IndexBuildResult);

	result->heap_tuples = reltuples;
	result->index_tuples = idxtuples;

	return result;
}
1275 : :
/*
 * brinbuildempty() -- construct the index's init fork (INIT_FORKNUM),
 * which for BRIN consists solely of a metapage.  The whole page image is
 * WAL-logged via log_newpage_buffer().
 */
void
brinbuildempty(Relation index)
{
	Buffer		metabuf;

	/* An empty BRIN index has a metapage only. */
	metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
								EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

	/*
	 * Initialize and xlog metabuffer.  Init + dirty + log must happen
	 * atomically, hence the critical section.
	 */
	START_CRIT_SECTION();
	brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),
					   BRIN_CURRENT_VERSION);
	MarkBufferDirty(metabuf);
	log_newpage_buffer(metabuf, true);
	END_CRIT_SECTION();

	UnlockReleaseBuffer(metabuf);
}
1295 : :
1296 : : /*
1297 : : * brinbulkdelete
1298 : : * Since there are no per-heap-tuple index tuples in BRIN indexes,
1299 : : * there's not a lot we can do here.
1300 : : *
1301 : : * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap
1302 : : * tuple is deleted), meaning the need to re-run summarization on the affected
1303 : : * range. Would need to add an extra flag in brintuples for that.
1304 : : */
1305 : : IndexBulkDeleteResult *
3710 tgl@sss.pgh.pa.us 1306 : 10 : brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
1307 : : IndexBulkDeleteCallback callback, void *callback_state)
1308 : : {
1309 : : /* allocate stats if first time through, else re-use existing struct */
4146 alvherre@alvh.no-ip. 1310 [ + - ]: 10 : if (stats == NULL)
1280 peter@eisentraut.org 1311 : 10 : stats = palloc0_object(IndexBulkDeleteResult);
1312 : :
3710 tgl@sss.pgh.pa.us 1313 : 10 : return stats;
1314 : : }
1315 : :
1316 : : /*
1317 : : * This routine is in charge of "vacuuming" a BRIN index: we just summarize
1318 : : * ranges that are currently unsummarized.
1319 : : */
1320 : : IndexBulkDeleteResult *
1321 : 67 : brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
1322 : : {
1323 : : Relation heapRel;
1324 : :
1325 : : /* No-op in ANALYZE ONLY mode */
4146 alvherre@alvh.no-ip. 1326 [ + + ]: 67 : if (info->analyze_only)
3710 tgl@sss.pgh.pa.us 1327 : 3 : return stats;
1328 : :
4146 alvherre@alvh.no-ip. 1329 [ + + ]: 64 : if (!stats)
1280 peter@eisentraut.org 1330 : 57 : stats = palloc0_object(IndexBulkDeleteResult);
4146 alvherre@alvh.no-ip. 1331 : 64 : stats->num_pages = RelationGetNumberOfBlocks(info->index);
1332 : : /* rest of stats is initialized by zeroing */
1333 : :
2610 andres@anarazel.de 1334 : 64 : heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),
1335 : : AccessShareLock);
1336 : :
3868 alvherre@alvh.no-ip. 1337 : 64 : brin_vacuum_scan(info->index, info->strategy);
1338 : :
3054 1339 : 64 : brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,
1340 : : &stats->num_index_tuples, &stats->num_index_tuples);
1341 : :
2610 andres@anarazel.de 1342 : 64 : table_close(heapRel, AccessShareLock);
1343 : :
3710 tgl@sss.pgh.pa.us 1344 : 64 : return stats;
1345 : : }
1346 : :
1347 : : /*
1348 : : * reloptions processor for BRIN indexes
1349 : : */
1350 : : bytea *
1351 : 658 : brinoptions(Datum reloptions, bool validate)
1352 : : {
1353 : : static const relopt_parse_elt tab[] = {
1354 : : {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},
1355 : : {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}
1356 : : };
1357 : :
2322 michael@paquier.xyz 1358 : 658 : return (bytea *) build_reloptions(reloptions, validate,
1359 : : RELOPT_KIND_BRIN,
1360 : : sizeof(BrinOptions),
1361 : : tab, lengthof(tab));
1362 : : }
1363 : :
1364 : : /*
1365 : : * SQL-callable function to scan through an index and summarize all ranges
1366 : : * that are not currently summarized.
1367 : : */
1368 : : Datum
4146 alvherre@alvh.no-ip. 1369 : 38 : brin_summarize_new_values(PG_FUNCTION_ARGS)
1370 : : {
3270 1371 : 38 : Datum relation = PG_GETARG_DATUM(0);
1372 : :
1373 : 38 : return DirectFunctionCall2(brin_summarize_range,
1374 : : relation,
1375 : : Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));
1376 : : }
1377 : :
/*
 * SQL-callable function to summarize the indicated page range, if not
 * already summarized.  If the second argument is BRIN_ALL_BLOCKRANGES, all
 * unsummarized ranges are summarized.
 *
 * Returns the number of ranges summarized, as int32.
 */
Datum
brin_summarize_range(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	int64		heapBlk64 = PG_GETARG_INT64(1);
	BlockNumber heapBlk;
	Oid			heapoid;
	Relation	indexRel;
	Relation	heapRel;
	Oid			save_userid;
	int			save_sec_context;
	int			save_nestlevel;
	double		numSummarized = 0;

	/* Summarization writes to the index; disallow during recovery. */
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("BRIN control functions cannot be executed during recovery.")));

	/* Validate the int64 argument before narrowing it to BlockNumber. */
	if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("block number out of range: %" PRId64, heapBlk64)));
	heapBlk = (BlockNumber) heapBlk64;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indexoid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 */
	heapoid = IndexGetRelation(indexoid, true);
	if (OidIsValid(heapoid))
	{
		heapRel = table_open(heapoid, ShareUpdateExclusiveLock);

		/*
		 * Autovacuum calls us.  For its benefit, switch to the table
		 * owner's userid, so that any index functions are run as that user.
		 * Also lock down security-restricted operations and arrange to make
		 * GUC variable changes local to this command.  This is harmless,
		 * albeit unnecessary, when called from SQL, because we fail shortly
		 * if the user does not own the index.
		 */
		GetUserIdAndSecContext(&save_userid, &save_sec_context);
		SetUserIdAndSecContext(heapRel->rd_rel->relowner,
							   save_sec_context | SECURITY_RESTRICTED_OPERATION);
		save_nestlevel = NewGUCNestLevel();
		RestrictSearchPath();
	}
	else
	{
		heapRel = NULL;
		/* Set these just to suppress "uninitialized variable" warnings */
		save_userid = InvalidOid;
		save_sec_context = -1;
		save_nestlevel = -1;
	}

	indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

	/* Must be a BRIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != BRIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a BRIN index",
						RelationGetRelationName(indexRel))));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could
	 * have netted us the wrong table.  Recheck.
	 */
	if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index \"%s\"",
						RelationGetRelationName(indexRel))));

	/* see gin_clean_pending_list() */
	if (indexRel->rd_index->indisvalid)
		brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);
	else
		ereport(DEBUG1,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("index \"%s\" is not valid",
						RelationGetRelationName(indexRel))));

	/* Roll back any GUC changes executed by index functions */
	AtEOXact_GUC(false, save_nestlevel);

	/* Restore userid and security context */
	SetUserIdAndSecContext(save_userid, save_sec_context);

	index_close(indexRel, ShareUpdateExclusiveLock);
	table_close(heapRel, ShareUpdateExclusiveLock);

	PG_RETURN_INT32((int32) numSummarized);
}
1489 : :
/*
 * SQL-callable interface to mark a range as no longer summarized
 *
 * Removes the revmap entry (and hence the summary) for the range containing
 * the given heap block; it can later be re-summarized.
 */
Datum
brin_desummarize_range(PG_FUNCTION_ARGS)
{
	Oid			indexoid = PG_GETARG_OID(0);
	int64		heapBlk64 = PG_GETARG_INT64(1);
	BlockNumber heapBlk;
	Oid			heapoid;
	Relation	heapRel;
	Relation	indexRel;
	bool		done;

	/* Desummarization writes to the index; disallow during recovery. */
	if (RecoveryInProgress())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("BRIN control functions cannot be executed during recovery.")));

	/* Validate the int64 argument before narrowing it to BlockNumber. */
	if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)
		ereport(ERROR,
				(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
				 errmsg("block number out of range: %" PRId64,
						heapBlk64)));
	heapBlk = (BlockNumber) heapBlk64;

	/*
	 * We must lock table before index to avoid deadlocks.  However, if the
	 * passed indexoid isn't an index then IndexGetRelation() will fail.
	 * Rather than emitting a not-very-helpful error message, postpone
	 * complaining, expecting that the is-it-an-index test below will fail.
	 *
	 * Unlike brin_summarize_range(), autovacuum never calls this.  Hence,
	 * we don't switch userid.
	 */
	heapoid = IndexGetRelation(indexoid, true);
	if (OidIsValid(heapoid))
		heapRel = table_open(heapoid, ShareUpdateExclusiveLock);
	else
		heapRel = NULL;

	indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

	/* Must be a BRIN index */
	if (indexRel->rd_rel->relkind != RELKIND_INDEX ||
		indexRel->rd_rel->relam != BRIN_AM_OID)
		ereport(ERROR,
				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
				 errmsg("\"%s\" is not a BRIN index",
						RelationGetRelationName(indexRel))));

	/* User must own the index (comparable to privileges needed for VACUUM) */
	if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))
		aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,
					   RelationGetRelationName(indexRel));

	/*
	 * Since we did the IndexGetRelation call above without any lock, it's
	 * barely possible that a race against an index drop/recreation could
	 * have netted us the wrong table.  Recheck.
	 */
	if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))
		ereport(ERROR,
				(errcode(ERRCODE_UNDEFINED_TABLE),
				 errmsg("could not open parent table of index \"%s\"",
						RelationGetRelationName(indexRel))));

	/* see gin_clean_pending_list() */
	if (indexRel->rd_index->indisvalid)
	{
		/* the revmap does the hard work; retry until it reports done */
		do
		{
			done = brinRevmapDesummarizeRange(indexRel, heapBlk);
		}
		while (!done);
	}
	else
		ereport(DEBUG1,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("index \"%s\" is not valid",
						RelationGetRelationName(indexRel))));

	index_close(indexRel, ShareUpdateExclusiveLock);
	table_close(heapRel, ShareUpdateExclusiveLock);

	PG_RETURN_VOID();
}
1579 : :
1580 : : /*
1581 : : * Build a BrinDesc used to create or scan a BRIN index
1582 : : */
1583 : : BrinDesc *
4146 1584 : 2296 : brin_build_desc(Relation rel)
1585 : : {
1586 : : BrinOpcInfo **opcinfo;
1587 : : BrinDesc *bdesc;
1588 : : TupleDesc tupdesc;
1589 : 2296 : int totalstored = 0;
1590 : : int keyno;
1591 : : long totalsize;
1592 : : MemoryContext cxt;
1593 : : MemoryContext oldcxt;
1594 : :
1595 : 2296 : cxt = AllocSetContextCreate(CurrentMemoryContext,
1596 : : "brin desc cxt",
1597 : : ALLOCSET_SMALL_SIZES);
1598 : 2296 : oldcxt = MemoryContextSwitchTo(cxt);
1599 : 2296 : tupdesc = RelationGetDescr(rel);
1600 : :
1601 : : /*
1602 : : * Obtain BrinOpcInfo for each indexed column. While at it, accumulate
1603 : : * the number of columns stored, since the number is opclass-defined.
1604 : : */
1031 tgl@sss.pgh.pa.us 1605 : 2296 : opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);
4146 alvherre@alvh.no-ip. 1606 [ + + ]: 38133 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1607 : : {
1608 : : FmgrInfo *opcInfoFn;
3129 andres@anarazel.de 1609 : 35837 : Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);
1610 : :
4146 alvherre@alvh.no-ip. 1611 : 35837 : opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);
1612 : :
1613 : 71674 : opcinfo[keyno] = (BrinOpcInfo *)
219 peter@eisentraut.org 1614 :GNC 35837 : DatumGetPointer(FunctionCall1(opcInfoFn, ObjectIdGetDatum(attr->atttypid)));
4146 alvherre@alvh.no-ip. 1615 :CBC 35837 : totalstored += opcinfo[keyno]->oi_nstored;
1616 : : }
1617 : :
1618 : : /* Allocate our result struct and fill it in */
1619 : 2296 : totalsize = offsetof(BrinDesc, bd_info) +
1620 : 2296 : sizeof(BrinOpcInfo *) * tupdesc->natts;
1621 : :
1622 : 2296 : bdesc = palloc(totalsize);
1623 : 2296 : bdesc->bd_context = cxt;
1624 : 2296 : bdesc->bd_index = rel;
1625 : 2296 : bdesc->bd_tupdesc = tupdesc;
1626 : 2296 : bdesc->bd_disktdesc = NULL; /* generated lazily */
1627 : 2296 : bdesc->bd_totalstored = totalstored;
1628 : :
1629 [ + + ]: 38133 : for (keyno = 0; keyno < tupdesc->natts; keyno++)
1630 : 35837 : bdesc->bd_info[keyno] = opcinfo[keyno];
1631 : 2296 : pfree(opcinfo);
1632 : :
1633 : 2296 : MemoryContextSwitchTo(oldcxt);
1634 : :
1635 : 2296 : return bdesc;
1636 : : }
1637 : :
/*
 * Release a BrinDesc previously obtained from brin_build_desc.
 *
 * All memory belonging to the descriptor lives in its private memory
 * context (bd_context), so deleting that context releases everything at
 * once.
 */
void
brin_free_desc(BrinDesc *bdesc)
{
	/* make sure the tupdesc is still valid */
	Assert(bdesc->bd_tupdesc->tdrefcount >= 1);
	/* no need for retail pfree */
	MemoryContextDelete(bdesc->bd_context);
}
1646 : :
1647 : : /*
1648 : : * Fetch index's statistical data into *stats
1649 : : */
1650 : : void
3265 1651 : 5368 : brinGetStats(Relation index, BrinStatsData *stats)
1652 : : {
1653 : : Buffer metabuffer;
1654 : : Page metapage;
1655 : : BrinMetaPageData *metadata;
1656 : :
1657 : 5368 : metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1658 : 5368 : LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1659 : 5368 : metapage = BufferGetPage(metabuffer);
1660 : 5368 : metadata = (BrinMetaPageData *) PageGetContents(metapage);
1661 : :
1662 : 5368 : stats->pagesPerRange = metadata->pagesPerRange;
1663 : 5368 : stats->revmapNumPages = metadata->lastRevmapPage - 1;
1664 : :
1665 : 5368 : UnlockReleaseBuffer(metabuffer);
1666 : 5368 : }
1667 : :
1668 : : /*
1669 : : * Initialize a BrinBuildState appropriate to create tuples on the given index.
1670 : : */
1671 : : static BrinBuildState *
4146 1672 : 239 : initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,
1673 : : BlockNumber pagesPerRange, BlockNumber tablePages)
1674 : : {
1675 : : BrinBuildState *state;
828 tomas.vondra@postgre 1676 : 239 : BlockNumber lastRange = 0;
1677 : :
1280 peter@eisentraut.org 1678 : 239 : state = palloc_object(BrinBuildState);
1679 : :
4146 alvherre@alvh.no-ip. 1680 : 239 : state->bs_irel = idxRel;
1681 : 239 : state->bs_numtuples = 0;
828 tomas.vondra@postgre 1682 : 239 : state->bs_reltuples = 0;
4146 alvherre@alvh.no-ip. 1683 : 239 : state->bs_currentInsertBuf = InvalidBuffer;
1684 : 239 : state->bs_pagesPerRange = pagesPerRange;
1685 : 239 : state->bs_currRangeStart = 0;
1686 : 239 : state->bs_rmAccess = revmap;
1687 : 239 : state->bs_bdesc = brin_build_desc(idxRel);
1688 : 239 : state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);
828 tomas.vondra@postgre 1689 : 239 : state->bs_leader = NULL;
1690 : 239 : state->bs_worker_id = 0;
806 1691 : 239 : state->bs_sortstate = NULL;
1692 : :
1693 : : /* Remember the memory context to use for an empty tuple, if needed. */
828 1694 : 239 : state->bs_context = CurrentMemoryContext;
1695 : 239 : state->bs_emptyTuple = NULL;
1696 : 239 : state->bs_emptyTupleLen = 0;
1697 : :
1698 : : /*
1699 : : * Calculate the start of the last page range. Page numbers are 0-based,
1700 : : * so to calculate the index we need to subtract one. The integer division
1701 : : * gives us the index of the page range.
1702 : : */
1703 [ + + ]: 239 : if (tablePages > 0)
1704 : 177 : lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;
1705 : :
1706 : : /* Now calculate the start of the next range. */
1707 : 239 : state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;
1708 : :
4146 alvherre@alvh.no-ip. 1709 : 239 : return state;
1710 : : }
1711 : :
/*
 * Release resources associated with a BrinBuildState.
 */
static void
terminate_brin_buildstate(BrinBuildState *state)
{
	/*
	 * Release the last index buffer used.  We might as well ensure that
	 * whatever free space remains in that page is available in FSM, too.
	 */
	if (!BufferIsInvalid(state->bs_currentInsertBuf))
	{
		Page		page;
		Size		freespace;
		BlockNumber blk;

		/* capture page number and free space before releasing the buffer */
		page = BufferGetPage(state->bs_currentInsertBuf);
		freespace = PageGetFreeSpace(page);
		blk = BufferGetBlockNumber(state->bs_currentInsertBuf);
		ReleaseBuffer(state->bs_currentInsertBuf);
		RecordPageWithFreeSpace(state->bs_irel, blk, freespace);
		/* propagate the leaf-page FSM update up to the upper FSM pages */
		FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);
	}

	/* bs_bdesc lives in its own memory context; the rest was palloc'd */
	brin_free_desc(state->bs_bdesc);
	pfree(state->bs_dtuple);
	pfree(state);
}
1740 : :
/*
 * On the given BRIN index, summarize the heap page range that corresponds
 * to the heap block number given.
 *
 * This routine can run in parallel with insertions into the heap.  To avoid
 * missing those values from the summary tuple, we first insert a placeholder
 * index tuple into the index, then execute the heap scan; transactions
 * concurrent with the scan update the placeholder tuple.  After the scan, we
 * union the placeholder tuple with the one computed by this routine.  The
 * update of the index value happens in a loop, so that if somebody updates
 * the placeholder tuple after we read it, we detect the case and try again.
 * This ensures that the concurrently inserted tuples are not lost.
 *
 * A further corner case is this routine being asked to summarize the partial
 * range at the end of the table.  heapNumBlks is the (possibly outdated)
 * table size; if we notice that the requested range lies beyond that size,
 * we re-compute the table size after inserting the placeholder tuple, to
 * avoid missing pages that were appended recently.
 */
static void
summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
				BlockNumber heapBlk, BlockNumber heapNumBlks)
{
	Buffer		phbuf;			/* buffer containing the placeholder tuple */
	BrinTuple  *phtup;			/* current version of the placeholder tuple */
	Size		phsz;
	OffsetNumber offset;
	BlockNumber scanNumBlks;

	/*
	 * Insert the placeholder tuple
	 */
	phbuf = InvalidBuffer;
	phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);
	offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,
						   state->bs_rmAccess, &phbuf,
						   heapBlk, phtup, phsz);

	/*
	 * Compute range end.  We hold ShareUpdateExclusive lock on table, so it
	 * cannot shrink concurrently (but it can grow).
	 */
	Assert(heapBlk % state->bs_pagesPerRange == 0);
	if (heapBlk + state->bs_pagesPerRange > heapNumBlks)
	{
		/*
		 * If we're asked to scan what we believe to be the final range on the
		 * table (i.e. a range that might be partial) we need to recompute our
		 * idea of what the latest page is after inserting the placeholder
		 * tuple.  Anyone that grows the table later will update the
		 * placeholder tuple, so it doesn't matter that we won't scan these
		 * pages ourselves.  Careful: the table might have been extended
		 * beyond the current range, so clamp our result.
		 *
		 * Fortunately, this should occur infrequently.
		 */
		scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,
						  state->bs_pagesPerRange);
	}
	else
	{
		/* Easy case: range is known to be complete */
		scanNumBlks = state->bs_pagesPerRange;
	}

	/*
	 * Execute the partial heap scan covering the heap blocks in the specified
	 * page range, summarizing the heap tuples in it.  This scan stops just
	 * short of brinbuildCallback creating the new index entry.
	 *
	 * Note that it is critical we use the "any visible" mode of
	 * table_index_build_range_scan here: otherwise, we would miss tuples
	 * inserted by transactions that are still in progress, among other corner
	 * cases.
	 */
	state->bs_currRangeStart = heapBlk;
	table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,
								 heapBlk, scanNumBlks,
								 brinbuildCallback, state, NULL);

	/*
	 * Now we update the values obtained by the scan with the placeholder
	 * tuple.  We do this in a loop which only terminates if we're able to
	 * update the placeholder tuple successfully; if we are not, this means
	 * somebody else modified the placeholder tuple after we read it.
	 */
	for (;;)
	{
		BrinTuple  *newtup;
		Size		newsize;
		bool		didupdate;
		bool		samepage;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Update the summary tuple and try to update.
		 */
		newtup = brin_form_tuple(state->bs_bdesc,
								 heapBlk, state->bs_dtuple, &newsize);
		samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);
		didupdate =
			brin_doupdate(state->bs_irel, state->bs_pagesPerRange,
						  state->bs_rmAccess, heapBlk, phbuf, offset,
						  phtup, phsz, newtup, newsize, samepage);
		brin_free_tuple(phtup);
		brin_free_tuple(newtup);

		/* If the update succeeded, we're done. */
		if (didupdate)
			break;

		/*
		 * If the update didn't work, it might be because somebody updated the
		 * placeholder tuple concurrently.  Extract the new version, union it
		 * with the values we have from the scan, and start over.  (There are
		 * other reasons for the update to fail, but it's simple to treat them
		 * the same.)
		 */
		phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
										 &offset, &phsz, BUFFER_LOCK_SHARE);
		/* the placeholder tuple must exist */
		if (phtup == NULL)
			elog(ERROR, "missing placeholder tuple");
		/* copy the tuple out before dropping the buffer lock */
		phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);
		LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);

		/* merge it into the tuple from the heap scan */
		union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);
	}

	ReleaseBuffer(phbuf);
}
1874 : :
/*
 * Summarize page ranges that are not already summarized.  If pageRange is
 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the
 * page range containing the given heap page number is scanned.
 * If include_partial is true, then the partial range at the end of the table
 * is summarized, otherwise not.
 *
 * For each new index tuple inserted, *numSummarized (if not NULL) is
 * incremented; for each existing tuple, *numExisting (if not NULL) is
 * incremented.
 */
static void
brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,
			  bool include_partial, double *numSummarized, double *numExisting)
{
	BrinRevmap *revmap;
	BrinBuildState *state = NULL;	/* created lazily, on first hole found */
	IndexInfo  *indexInfo = NULL;
	BlockNumber heapNumBlocks;
	BlockNumber pagesPerRange;
	Buffer		buf;
	BlockNumber startBlk;

	revmap = brinRevmapInitialize(index, &pagesPerRange);

	/* determine range of pages to process */
	heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
	if (pageRange == BRIN_ALL_BLOCKRANGES)
		startBlk = 0;
	else
	{
		/* restrict the scan to the single range containing pageRange */
		startBlk = (pageRange / pagesPerRange) * pagesPerRange;
		heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);
	}
	if (startBlk > heapNumBlocks)
	{
		/* Nothing to do if start point is beyond end of table */
		brinRevmapTerminate(revmap);
		return;
	}

	/*
	 * Scan the revmap to find unsummarized items.
	 */
	buf = InvalidBuffer;
	for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)
	{
		BrinTuple  *tup;
		OffsetNumber off;

		/*
		 * Unless requested to summarize even a partial range, go away now if
		 * we think the next range is partial.  Caller would pass true when it
		 * is typically run once bulk data loading is done
		 * (brin_summarize_new_values), and false when it is typically the
		 * result of arbitrarily-scheduled maintenance command (vacuuming).
		 */
		if (!include_partial &&
			(startBlk + pagesPerRange > heapNumBlocks))
			break;

		CHECK_FOR_INTERRUPTS();

		tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,
									   BUFFER_LOCK_SHARE);
		if (tup == NULL)
		{
			/* no revmap entry for this heap range. Summarize it. */
			if (state == NULL)
			{
				/* first time through */
				Assert(!indexInfo);
				state = initialize_brin_buildstate(index, revmap,
												   pagesPerRange,
												   InvalidBlockNumber);
				indexInfo = BuildIndexInfo(index);
			}
			summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);

			/* and re-initialize state for the next range */
			brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

			if (numSummarized)
				*numSummarized += 1.0;
		}
		else
		{
			/* range already summarized; buf is still share-locked here */
			if (numExisting)
				*numExisting += 1.0;
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
		}
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);

	/* free resources */
	brinRevmapTerminate(revmap);
	if (state)
	{
		terminate_brin_buildstate(state);
		pfree(indexInfo);
	}
}
1979 : :
1980 : : /*
1981 : : * Given a deformed tuple in the build state, convert it into the on-disk
1982 : : * format and insert it into the index, making the revmap point to it.
1983 : : */
1984 : : static void
4146 1985 : 1333 : form_and_insert_tuple(BrinBuildState *state)
1986 : : {
1987 : : BrinTuple *tup;
1988 : : Size size;
1989 : :
1990 : 1333 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
1991 : : state->bs_dtuple, &size);
1992 : 1333 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
1993 : : &state->bs_currentInsertBuf, state->bs_currRangeStart,
1994 : : tup, size);
1995 : 1333 : state->bs_numtuples++;
1996 : :
1997 : 1333 : pfree(tup);
1998 : 1333 : }
1999 : :
2000 : : /*
2001 : : * Given a deformed tuple in the build state, convert it into the on-disk
2002 : : * format and write it to a (shared) tuplesort (the leader will insert it
2003 : : * into the index later).
2004 : : */
2005 : : static void
828 tomas.vondra@postgre 2006 : 48 : form_and_spill_tuple(BrinBuildState *state)
2007 : : {
2008 : : BrinTuple *tup;
2009 : : Size size;
2010 : :
2011 : : /* don't insert empty tuples in parallel build */
2012 [ + + ]: 48 : if (state->bs_dtuple->bt_empty_range)
2013 : 9 : return;
2014 : :
2015 : 39 : tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,
2016 : : state->bs_dtuple, &size);
2017 : :
2018 : : /* write the BRIN tuple to the tuplesort */
806 2019 : 39 : tuplesort_putbrintuple(state->bs_sortstate, tup, size);
2020 : :
828 2021 : 39 : state->bs_numtuples++;
2022 : :
2023 : 39 : pfree(tup);
2024 : : }
2025 : :
/*
 * Given two deformed tuples, adjust the first one so that it's consistent
 * with the summary values in both.
 */
static void
union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
{
	int			keyno;
	BrinMemTuple *db;			/* deformed copy of "b" */
	MemoryContext cxt;
	MemoryContext oldcxt;

	/* Use our own memory context to avoid retail pfree */
	cxt = AllocSetContextCreate(CurrentMemoryContext,
								"brin union",
								ALLOCSET_DEFAULT_SIZES);
	oldcxt = MemoryContextSwitchTo(cxt);
	db = brin_deform_tuple(bdesc, b, NULL);
	MemoryContextSwitchTo(oldcxt);

	/*
	 * Check if the ranges are empty.
	 *
	 * If at least one of them is empty, we don't need to call per-key union
	 * functions at all.  If "b" is empty, we just use "a" as the result (it
	 * might be empty too, but that's fine).  If "a" is empty but "b" is not,
	 * we use "b" as the result (but we have to copy the data into "a" first).
	 *
	 * Only when both ranges are non-empty, we actually do the per-key merge.
	 */

	/* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
	if (db->bt_empty_range)
	{
		/* skip the per-key merge */
		MemoryContextDelete(cxt);
		return;
	}

	/*
	 * Now we know "b" is not empty.  If "a" is empty, then "b" is the result.
	 * But we need to copy the data from "b" to "a" first, because that's how
	 * we pass result out.
	 *
	 * We have to copy all the global/per-key flags etc. too.
	 */
	if (a->bt_empty_range)
	{
		for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
		{
			int			i;
			BrinValues *col_a = &a->bt_columns[keyno];
			BrinValues *col_b = &db->bt_columns[keyno];
			BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

			col_a->bv_allnulls = col_b->bv_allnulls;
			col_a->bv_hasnulls = col_b->bv_hasnulls;

			/* If "b" has no data, we're done. */
			if (col_b->bv_allnulls)
				continue;

			/* deep-copy the stored datums from "b" into "a" */
			for (i = 0; i < opcinfo->oi_nstored; i++)
				col_a->bv_values[i] =
					datumCopy(col_b->bv_values[i],
							  opcinfo->oi_typcache[i]->typbyval,
							  opcinfo->oi_typcache[i]->typlen);
		}

		/* "a" started empty, but "b" was not empty, so remember that */
		a->bt_empty_range = false;

		/* skip the per-key merge */
		MemoryContextDelete(cxt);
		return;
	}

	/* Now we know neither range is empty. */
	for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
	{
		FmgrInfo   *unionFn;
		BrinValues *col_a = &a->bt_columns[keyno];
		BrinValues *col_b = &db->bt_columns[keyno];
		BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

		if (opcinfo->oi_regular_nulls)
		{
			/* Does the "b" summary represent any NULL values? */
			bool		b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);

			/* Adjust "hasnulls". */
			if (!col_a->bv_allnulls && b_has_nulls)
				col_a->bv_hasnulls = true;

			/* If there are no values in B, there's nothing left to do. */
			if (col_b->bv_allnulls)
				continue;

			/*
			 * Adjust "allnulls".  If A doesn't have values, just copy the
			 * values from B into A, and we're done.  We cannot run the
			 * operators in this case, because values in A might contain
			 * garbage.  Note we already established that B contains values.
			 *
			 * Also adjust "hasnulls" in order not to forget the summary
			 * represents NULL values.  This is not redundant with the earlier
			 * update, because that only happens when allnulls=false.
			 */
			if (col_a->bv_allnulls)
			{
				int			i;

				col_a->bv_allnulls = false;
				col_a->bv_hasnulls = true;

				for (i = 0; i < opcinfo->oi_nstored; i++)
					col_a->bv_values[i] =
						datumCopy(col_b->bv_values[i],
								  opcinfo->oi_typcache[i]->typbyval,
								  opcinfo->oi_typcache[i]->typlen);

				continue;
			}
		}

		/* both columns have values: let the opclass union them */
		unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,
									BRIN_PROCNUM_UNION);
		FunctionCall3Coll(unionFn,
						  bdesc->bd_index->rd_indcollation[keyno],
						  PointerGetDatum(bdesc),
						  PointerGetDatum(col_a),
						  PointerGetDatum(col_b));
	}

	MemoryContextDelete(cxt);
}
2162 : :
/*
 * brin_vacuum_scan
 *		Do a complete scan of the index during VACUUM.
 *
 * This routine scans the complete index looking for uncataloged index pages,
 * i.e. those that might have been lost due to a crash after index extension
 * and such.
 */
static void
brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)
{
	BlockRangeReadStreamPrivate p;
	ReadStream *stream;
	Buffer		buf;

	/* stream the whole main fork, from block 0 to the current last block */
	p.current_blocknum = 0;
	p.last_exclusive = RelationGetNumberOfBlocks(idxrel);

	/*
	 * It is safe to use batchmode as block_range_read_stream_cb takes no
	 * locks.
	 */
	stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE |
										READ_STREAM_FULL |
										READ_STREAM_USE_BATCHING,
										strategy,
										idxrel,
										MAIN_FORKNUM,
										block_range_read_stream_cb,
										&p,
										0);

	/*
	 * Scan the index in physical order, and clean up any possible mess in
	 * each page.
	 */
	while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
	{
		CHECK_FOR_INTERRUPTS();

		brin_page_cleanup(idxrel, buf);

		ReleaseBuffer(buf);
	}

	read_stream_end(stream);

	/*
	 * Update all upper pages in the index's FSM, as well.  This ensures not
	 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,
	 * but also that any pre-existing damage or out-of-dateness is repaired.
	 */
	FreeSpaceMapVacuum(idxrel);
}
2217 : :
/*
 * Merge one heap tuple's values into the deformed summary tuple *dtup.
 *
 * Returns true if the summary was modified (so the caller knows the updated
 * tuple needs to be written back to the index), false if the new values were
 * already covered by the existing summary.
 */
static bool
add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,
					const Datum *values, const bool *nulls)
{
	int			keyno;

	/* If the range starts empty, we're certainly going to modify it. */
	bool		modified = dtup->bt_empty_range;

	/*
	 * Compare the key values of the new tuple to the stored index values; our
	 * deformed tuple will get updated if the new tuple doesn't fit the
	 * original range (note this means we can't break out of the loop early).
	 * Make a note of whether this happens, so that we know to insert the
	 * modified tuple later.
	 */
	for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
	{
		Datum		result;
		BrinValues *bval;
		FmgrInfo   *addValue;
		bool		has_nulls;

		bval = &dtup->bt_columns[keyno];

		/*
		 * Does the range have actual NULL values?  Either of the flags can be
		 * set, but we ignore the state before adding first row.
		 *
		 * We have to remember this, because we'll modify the flags and we
		 * need to know if the range started as empty.
		 */
		has_nulls = ((!dtup->bt_empty_range) &&
					 (bval->bv_hasnulls || bval->bv_allnulls));

		/*
		 * If the value we're adding is NULL, handle it locally.  Otherwise
		 * call the BRIN_PROCNUM_ADDVALUE procedure.
		 */
		if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])
		{
			/*
			 * If the new value is null, we record that we saw it if it's the
			 * first one; otherwise, there's nothing to do.
			 */
			if (!bval->bv_hasnulls)
			{
				bval->bv_hasnulls = true;
				modified = true;
			}

			continue;
		}

		addValue = index_getprocinfo(idxRel, keyno + 1,
									 BRIN_PROCNUM_ADDVALUE);
		result = FunctionCall4Coll(addValue,
								   idxRel->rd_indcollation[keyno],
								   PointerGetDatum(bdesc),
								   PointerGetDatum(bval),
								   values[keyno],
								   BoolGetDatum(nulls[keyno]));
		/* if that returned true, we need to insert the updated tuple */
		modified |= DatumGetBool(result);

		/*
		 * If the range had actual NULL values (i.e. did not start empty),
		 * make sure we don't forget about the NULL values.  Either the
		 * allnulls flag is still set to true, or (if the opclass cleared it)
		 * we need to set hasnulls=true.
		 *
		 * XXX This can only happen when the opclass modified the tuple, so
		 * the modified flag should be set.
		 */
		if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
		{
			Assert(modified);
			bval->bv_hasnulls = true;
		}
	}

	/*
	 * After updating summaries for all the keys, mark it as not empty.
	 *
	 * If we're actually changing the flag value (i.e. tuple started as
	 * empty), we should have modified the tuple.  So we should not see empty
	 * range that was not modified.
	 */
	Assert(!dtup->bt_empty_range || modified);
	dtup->bt_empty_range = false;

	return modified;
}
2311 : :
2312 : : static bool
2313 : 94968 : check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)
2314 : : {
2315 : : int keyno;
2316 : :
2317 : : /*
2318 : : * First check if there are any IS [NOT] NULL scan keys, and if we're
2319 : : * violating them.
2320 : : */
2321 [ + + ]: 95586 : for (keyno = 0; keyno < nnullkeys; keyno++)
2322 : : {
2323 : 1116 : ScanKey key = nullkeys[keyno];
2324 : :
2325 [ - + ]: 1116 : Assert(key->sk_attno == bval->bv_attno);
2326 : :
2327 : : /* Handle only IS NULL/IS NOT NULL tests */
2328 [ - + ]: 1116 : if (!(key->sk_flags & SK_ISNULL))
1818 tomas.vondra@postgre 2329 :UBC 0 : continue;
2330 : :
1818 tomas.vondra@postgre 2331 [ + + ]:CBC 1116 : if (key->sk_flags & SK_SEARCHNULL)
2332 : : {
2333 : : /* IS NULL scan key, but range has no NULLs */
2334 [ + + + + ]: 558 : if (!bval->bv_allnulls && !bval->bv_hasnulls)
2335 : 489 : return false;
2336 : : }
2337 [ + - ]: 558 : else if (key->sk_flags & SK_SEARCHNOTNULL)
2338 : : {
2339 : : /*
2340 : : * For IS NOT NULL, we can only skip ranges that are known to have
2341 : : * only nulls.
2342 : : */
2343 [ + + ]: 558 : if (bval->bv_allnulls)
2344 : 9 : return false;
2345 : : }
2346 : : else
2347 : : {
2348 : : /*
2349 : : * Neither IS NULL nor IS NOT NULL was used; assume all indexable
2350 : : * operators are strict and thus return false with NULL value in
2351 : : * the scan key.
2352 : : */
1818 tomas.vondra@postgre 2353 :UBC 0 : return false;
2354 : : }
2355 : : }
2356 : :
1818 tomas.vondra@postgre 2357 :CBC 94470 : return true;
2358 : : }
2359 : :
2360 : : /*
2361 : : * Create parallel context, and launch workers for leader.
2362 : : *
2363 : : * buildstate argument should be initialized (with the exception of the
2364 : : * tuplesort states, which may later be created based on shared
2365 : : * state initially set up here).
2366 : : *
2367 : : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
2368 : : *
2369 : : * request is the target number of parallel worker processes to launch.
2370 : : *
2371 : : * Sets buildstate's BrinLeader, which caller must use to shut down parallel
2372 : : * mode by passing it to _brin_end_parallel() at the very end of its index
2373 : : * build. If not even a single worker process can be launched, this is
2374 : : * never set, and caller should proceed with a serial index build.
2375 : : */
2376 : : static void
828 2377 : 5 : _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
2378 : : bool isconcurrent, int request)
2379 : : {
2380 : : ParallelContext *pcxt;
2381 : : int scantuplesortstates;
2382 : : Snapshot snapshot;
2383 : : Size estbrinshared;
2384 : : Size estsort;
2385 : : BrinShared *brinshared;
2386 : : Sharedsort *sharedsort;
95 michael@paquier.xyz 2387 :GNC 5 : BrinLeader *brinleader = palloc0_object(BrinLeader);
2388 : : WalUsage *walusage;
2389 : : BufferUsage *bufferusage;
828 tomas.vondra@postgre 2390 :CBC 5 : bool leaderparticipates = true;
2391 : : int querylen;
2392 : :
2393 : : #ifdef DISABLE_LEADER_PARTICIPATION
2394 : : leaderparticipates = false;
2395 : : #endif
2396 : :
2397 : : /*
2398 : : * Enter parallel mode, and create context for parallel build of brin
2399 : : * index
2400 : : */
2401 : 5 : EnterParallelMode();
2402 [ - + ]: 5 : Assert(request > 0);
2403 : 5 : pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",
2404 : : request);
2405 : :
2406 [ + - ]: 5 : scantuplesortstates = leaderparticipates ? request + 1 : request;
2407 : :
2408 : : /*
2409 : : * Prepare for scan of the base relation. In a normal index build, we use
2410 : : * SnapshotAny because we must retrieve all tuples and do our own time
2411 : : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
2412 : : * concurrent build, we take a regular MVCC snapshot and index whatever's
2413 : : * live according to that.
2414 : : */
2415 [ + - ]: 5 : if (!isconcurrent)
2416 : 5 : snapshot = SnapshotAny;
2417 : : else
828 tomas.vondra@postgre 2418 :UBC 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
2419 : :
2420 : : /*
2421 : : * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
2422 : : */
828 tomas.vondra@postgre 2423 :CBC 5 : estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2424 : 5 : shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
2425 : 5 : estsort = tuplesort_estimate_shared(scantuplesortstates);
2426 : 5 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
2427 : :
2428 : 5 : shm_toc_estimate_keys(&pcxt->estimator, 2);
2429 : :
2430 : : /*
2431 : : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
2432 : : * and PARALLEL_KEY_BUFFER_USAGE.
2433 : : *
2434 : : * If there are no extensions loaded that care, we could skip this. We
2435 : : * have no way of knowing whether anyone's looking at pgWalUsage or
2436 : : * pgBufferUsage, so do it unconditionally.
2437 : : */
2438 : 5 : shm_toc_estimate_chunk(&pcxt->estimator,
2439 : : mul_size(sizeof(WalUsage), pcxt->nworkers));
2440 : 5 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2441 : 5 : shm_toc_estimate_chunk(&pcxt->estimator,
2442 : : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2443 : 5 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2444 : :
2445 : : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
2446 [ + - ]: 5 : if (debug_query_string)
2447 : : {
2448 : 5 : querylen = strlen(debug_query_string);
2449 : 5 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
2450 : 5 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2451 : : }
2452 : : else
828 tomas.vondra@postgre 2453 :UBC 0 : querylen = 0; /* keep compiler quiet */
2454 : :
2455 : : /* Everyone's had a chance to ask for space, so now create the DSM */
828 tomas.vondra@postgre 2456 :CBC 5 : InitializeParallelDSM(pcxt);
2457 : :
2458 : : /* If no DSM segment was available, back out (do serial build) */
2459 [ - + ]: 5 : if (pcxt->seg == NULL)
2460 : : {
828 tomas.vondra@postgre 2461 [ # # # # ]:UBC 0 : if (IsMVCCSnapshot(snapshot))
2462 : 0 : UnregisterSnapshot(snapshot);
2463 : 0 : DestroyParallelContext(pcxt);
2464 : 0 : ExitParallelMode();
2465 : 0 : return;
2466 : : }
2467 : :
2468 : : /* Store shared build state, for which we reserved space */
828 tomas.vondra@postgre 2469 :CBC 5 : brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);
2470 : : /* Initialize immutable state */
2471 : 5 : brinshared->heaprelid = RelationGetRelid(heap);
2472 : 5 : brinshared->indexrelid = RelationGetRelid(index);
2473 : 5 : brinshared->isconcurrent = isconcurrent;
2474 : 5 : brinshared->scantuplesortstates = scantuplesortstates;
2475 : 5 : brinshared->pagesPerRange = buildstate->bs_pagesPerRange;
531 michael@paquier.xyz 2476 : 5 : brinshared->queryid = pgstat_get_my_query_id();
828 tomas.vondra@postgre 2477 : 5 : ConditionVariableInit(&brinshared->workersdonecv);
2478 : 5 : SpinLockInit(&brinshared->mutex);
2479 : :
2480 : : /* Initialize mutable state */
2481 : 5 : brinshared->nparticipantsdone = 0;
2482 : 5 : brinshared->reltuples = 0.0;
2483 : 5 : brinshared->indtuples = 0.0;
2484 : :
2485 : 5 : table_parallelscan_initialize(heap,
2486 : : ParallelTableScanFromBrinShared(brinshared),
2487 : : snapshot);
2488 : :
2489 : : /*
2490 : : * Store shared tuplesort-private state, for which we reserved space.
2491 : : * Then, initialize opaque state using tuplesort routine.
2492 : : */
2493 : 5 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
2494 : 5 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
2495 : : pcxt->seg);
2496 : :
2497 : : /*
2498 : : * Make the shared build state and the shared tuplesort state
2499 : : * discoverable by worker processes, which look them up in the TOC.
2500 : : */
2501 : 5 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);
2502 : 5 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
2503 : :
2504 : : /* Store query string for workers */
2505 [ + - ]: 5 : if (debug_query_string)
2506 : : {
2507 : : char *sharedquery;
2508 : :
2509 : 5 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
2510 : 5 : memcpy(sharedquery, debug_query_string, querylen + 1);
2511 : 5 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
2512 : : }
2513 : :
2514 : : /*
2515 : : * Allocate space for each worker's WalUsage and BufferUsage; no need to
2516 : : * initialize.
2517 : : */
2518 : 5 : walusage = shm_toc_allocate(pcxt->toc,
2519 : 5 : mul_size(sizeof(WalUsage), pcxt->nworkers));
2520 : 5 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2521 : 5 : bufferusage = shm_toc_allocate(pcxt->toc,
2522 : 5 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
2523 : 5 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
2524 : :
2525 : : /* Launch workers, saving status for leader/caller */
2526 : 5 : LaunchParallelWorkers(pcxt);
2527 : 5 : brinleader->pcxt = pcxt;
2528 : 5 : brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;
2529 [ + - ]: 5 : if (leaderparticipates)
2530 : 5 : brinleader->nparticipanttuplesorts++;
2531 : 5 : brinleader->brinshared = brinshared;
2532 : 5 : brinleader->sharedsort = sharedsort;
2533 : 5 : brinleader->snapshot = snapshot;
2534 : 5 : brinleader->walusage = walusage;
2535 : 5 : brinleader->bufferusage = bufferusage;
2536 : :
2537 : : /* If no workers were successfully launched, back out (do serial build) */
2538 [ + + ]: 5 : if (pcxt->nworkers_launched == 0)
2539 : : {
2540 : 1 : _brin_end_parallel(brinleader, NULL);
2541 : 1 : return;
2542 : : }
2543 : :
2544 : : /* Save leader state now that it's clear build will be parallel */
2545 : 4 : buildstate->bs_leader = brinleader;
2546 : :
2547 : : /* Join heap scan ourselves */
2548 [ + - ]: 4 : if (leaderparticipates)
2549 : 4 : _brin_leader_participate_as_worker(buildstate, heap, index);
2550 : :
2551 : : /*
2552 : : * Caller needs to wait for all launched workers when we return. Make
2553 : : * sure that the failure-to-start case will not hang forever.
2554 : : */
2555 : 4 : WaitForParallelWorkersToAttach(pcxt);
2556 : : }
2557 : :
2558 : : /*
2559 : : * Shut down workers, destroy parallel context, and end parallel mode.
2560 : : */
2561 : : static void
2562 : 5 : _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
2563 : : {
2564 : : int i;
2565 : :
 : : /*
 : : * NB: "state" may be NULL -- _brin_begin_parallel() passes NULL when no
 : : * workers could be launched -- and is not referenced in this function.
 : : */
2566 : : /* Shutdown worker processes */
2567 : 5 : WaitForParallelWorkersToFinish(brinleader->pcxt);
2568 : :
2569 : : /*
2570 : : * Next, accumulate WAL usage. (This must wait for the workers to finish,
2571 : : * or we might get incomplete data.)
2572 : : */
697 2573 [ + + ]: 11 : for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
2574 : 6 : InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
2575 : :
2576 : : /* Free last reference to MVCC snapshot, if one was used */
2577 [ - + - + ]: 5 : if (IsMVCCSnapshot(brinleader->snapshot))
697 tomas.vondra@postgre 2578 :UBC 0 : UnregisterSnapshot(brinleader->snapshot);
697 tomas.vondra@postgre 2579 :CBC 5 : DestroyParallelContext(brinleader->pcxt);
2580 : 5 : ExitParallelMode();
2581 : 5 : }
2582 : :
2583 : : /*
2584 : : * Within leader, wait for end of heap scan.
2585 : : *
2586 : : * When called, parallel heap scan started by _brin_begin_parallel() will
2587 : : * already be underway within worker processes (when leader participates
2588 : : * as a worker, we should end up here just as workers are finishing).
2589 : : *
2590 : : * Returns the total number of heap tuples scanned.
2591 : : */
2592 : : static double
2593 : 4 : _brin_parallel_heapscan(BrinBuildState *state)
2594 : : {
2595 : 4 : BrinShared *brinshared = state->bs_leader->brinshared;
2596 : : int nparticipanttuplesorts;
2597 : :
2598 : 4 : nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;
 : : /*
 : : * Standard condition-variable wait loop: re-check the done-counter under
 : : * the spinlock on every wakeup, and sleep again if not all participants
 : : * have finished yet.
 : : */
2599 : : for (;;)
2600 : : {
2601 [ - + ]: 13 : SpinLockAcquire(&brinshared->mutex);
2602 [ + + ]: 13 : if (brinshared->nparticipantsdone == nparticipanttuplesorts)
2603 : : {
2604 : : /* copy the data into leader state */
2605 : 4 : state->bs_reltuples = brinshared->reltuples;
2606 : 4 : state->bs_numtuples = brinshared->indtuples;
2607 : :
2608 : 4 : SpinLockRelease(&brinshared->mutex);
2609 : 4 : break;
2610 : : }
2611 : 9 : SpinLockRelease(&brinshared->mutex);
2612 : :
2613 : 9 : ConditionVariableSleep(&brinshared->workersdonecv,
2614 : : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
2615 : : }
2616 : :
2617 : 4 : ConditionVariableCancelSleep();
2618 : :
2619 : 4 : return state->bs_reltuples;
2620 : : }
2621 : :
2622 : : /*
2623 : : * Within leader, wait for end of heap scan and merge per-worker results.
2624 : : *
2625 : : * After waiting for all workers to finish, merge the per-worker results into
2626 : : * the complete index. The results from each worker are sorted by block number
2627 : : * (start of the page range). While combining the per-worker results we merge
2628 : : * summaries for the same page range, and also fill-in empty summaries for
2629 : : * ranges without any tuples.
2630 : : *
2631 : : * Returns the total number of heap tuples scanned.
2632 : : */
2633 : : static double
2634 : 4 : _brin_parallel_merge(BrinBuildState *state)
2635 : : {
2636 : : BrinTuple *btup;
2637 : 4 : BrinMemTuple *memtuple = NULL;
2638 : : Size tuplen;
2639 : 4 : BlockNumber prevblkno = InvalidBlockNumber;
2640 : : MemoryContext rangeCxt,
2641 : : oldCxt;
2642 : : double reltuples;
2643 : :
2644 : : /* wait for workers to scan table and produce partial results */
2645 : 4 : reltuples = _brin_parallel_heapscan(state);
2646 : :
2647 : : /* do the actual sort in the leader */
806 2648 : 4 : tuplesort_performsort(state->bs_sortstate);
2649 : :
2650 : : /*
2651 : : * Initialize BrinMemTuple we'll use to union summaries from workers (in
2652 : : * case they happened to produce parts of the same page range).
2653 : : */
828 2654 : 4 : memtuple = brin_new_memtuple(state->bs_bdesc);
2655 : :
2656 : : /*
2657 : : * Create a memory context we'll reset to combine results for a single
2658 : : * page range (received from the workers). We don't expect huge number of
2659 : : * overlaps under regular circumstances (because for large tables the
2660 : : * chunk size is likely larger than the BRIN page range), but it can
2661 : : * happen, and the union functions may do all kinds of stuff. So we better
2662 : : * reset the context once in a while.
2663 : : */
2664 : 4 : rangeCxt = AllocSetContextCreate(CurrentMemoryContext,
2665 : : "brin union",
2666 : : ALLOCSET_DEFAULT_SIZES);
2667 : 4 : oldCxt = MemoryContextSwitchTo(rangeCxt);
2668 : :
2669 : : /*
2670 : : * Read the BRIN tuples from the shared tuplesort, sorted by block number.
2671 : : * That probably gives us an index that is cheaper to scan, thanks to
2672 : : * mostly getting data from the same index page as before.
2673 : : */
806 2674 [ + + ]: 43 : while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)
2675 : : {
2676 : : /* Ranges should be multiples of pages_per_range for the index. */
697 2677 [ - + ]: 39 : Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);
2678 : :
2679 : : /*
2680 : : * Do we need to union summaries for the same page range?
2681 : : *
2682 : : * If this is the first brin tuple we read, then just deform it into
2683 : : * the memtuple, and continue with the next one from tuplesort. We
2684 : : * however may need to insert empty summaries into the index.
2685 : : *
2686 : : * If it's the same block as the last we saw, we simply union the brin
2687 : : * tuple into it, and we're done - we don't even need to insert empty
2688 : : * ranges, because that was done earlier when we saw the first brin
2689 : : * tuple (for this range).
2690 : : *
2691 : : * Finally, if it's not the first brin tuple, and it's not the same
2692 : : * page range, we need to do the insert and then deform the tuple into
2693 : : * the memtuple. Then we'll insert empty ranges before the new brin
2694 : : * tuple, if needed.
2695 : : */
828 2696 [ + + ]: 39 : if (prevblkno == InvalidBlockNumber)
2697 : : {
2698 : : /* First brin tuples, just deform into memtuple. */
2699 : 1 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2700 : :
2701 : : /* continue to insert empty pages before thisblock */
2702 : : }
2703 [ + + ]: 38 : else if (memtuple->bt_blkno == btup->bt_blkno)
2704 : : {
2705 : : /*
2706 : : * Not the first brin tuple, but same page range as the previous
2707 : : * one, so we can merge it into the memtuple.
2708 : : */
2709 : 19 : union_tuples(state->bs_bdesc, memtuple, btup);
2710 : 19 : continue;
2711 : : }
2712 : : else
2713 : : {
2714 : : BrinTuple *tmp;
2715 : : Size len;
2716 : :
2717 : : /*
2718 : : * We got brin tuple for a different page range, so form a brin
2719 : : * tuple from the memtuple, insert it, and re-init the memtuple
2720 : : * from the new brin tuple.
2721 : : */
2722 : 19 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2723 : : memtuple, &len);
2724 : :
2725 : 19 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2726 : : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2727 : :
2728 : : /*
2729 : : * Reset the per-output-range context. This frees all the memory
2730 : : * possibly allocated by the union functions, and also the BRIN
2731 : : * tuple we just formed and inserted.
2732 : : */
2733 : 19 : MemoryContextReset(rangeCxt);
2734 : :
2735 : 19 : memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);
2736 : :
2737 : : /* continue to insert empty pages before thisblock */
2738 : : }
2739 : :
2740 : : /* Fill empty ranges for all ranges missing in the tuplesort. */
2741 : 20 : brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);
2742 : :
2743 : 20 : prevblkno = btup->bt_blkno;
2744 : : }
2745 : :
806 2746 : 4 : tuplesort_end(state->bs_sortstate);
2747 : :
2748 : : /* Insert the summary accumulated for the last page range, if any. */
828 2749 [ + + ]: 4 : if (prevblkno != InvalidBlockNumber)
2750 : : {
2751 : : BrinTuple *tmp;
2752 : : Size len;
2753 : :
2754 : 1 : tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,
2755 : : memtuple, &len);
2756 : :
2757 : 1 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
2758 : : &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);
2759 : :
2760 : 1 : pfree(tmp);
2761 : : }
2762 : :
2763 : : /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */
2764 : 4 : brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);
2765 : :
2766 : : /*
2767 : : * Switch back to the original memory context, and destroy the one we
2768 : : * created to isolate the union_tuple calls.
2769 : : */
2770 : 4 : MemoryContextSwitchTo(oldCxt);
2771 : 4 : MemoryContextDelete(rangeCxt);
2772 : :
697 2773 : 4 : return reltuples;
2774 : : }
2775 : :
2776 : : /*
2777 : : * Returns size of shared memory required to store state for a parallel
2778 : : * brin index build based on the snapshot its parallel scan will use.
2779 : : */
2780 : : static Size
828 2781 : 5 : _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2782 : : {
 : : /* Fixed-size BrinShared header plus the parallel table-scan descriptor. */
2783 : : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
2784 : 5 : return add_size(BUFFERALIGN(sizeof(BrinShared)),
2785 : : table_parallelscan_estimate(heap, snapshot));
2786 : : }
2787 : :
2788 : : /*
2789 : : * Within leader, participate as a parallel worker.
2790 : : */
2791 : : static void
2792 : 4 : _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)
2793 : : {
2794 : 4 : BrinLeader *brinleader = buildstate->bs_leader;
2795 : : int sortmem;
2796 : :
2797 : : /*
2798 : : * Might as well use reliable figure when doling out maintenance_work_mem
2799 : : * (when requested number of workers were not launched, this will be
2800 : : * somewhat higher than it is for other workers).
2801 : : */
2802 : 4 : sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;
2803 : :
2804 : : /* Perform work common to all participants */
 : : /* (the final "true" is the "progress" argument of the common routine) */
806 2805 : 4 : _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
2806 : : brinleader->sharedsort, heap, index, sortmem, true);
828 2807 : 4 : }
2808 : :
2809 : : /*
2810 : : * Perform a worker's portion of a parallel sort.
2811 : : *
2812 : : * This generates a tuplesort for the worker portion of the table.
2813 : : *
2814 : : * sortmem is the amount of working memory to use within each worker,
2815 : : * expressed in KBs.
2816 : : *
2817 : : * When this returns, workers are done, and need only release resources.
2818 : : */
2819 : : static void
806 2820 : 10 : _brin_parallel_scan_and_build(BrinBuildState *state,
2821 : : BrinShared *brinshared, Sharedsort *sharedsort,
2822 : : Relation heap, Relation index,
2823 : : int sortmem, bool progress)
2824 : : {
2825 : : SortCoordinate coordinate;
2826 : : TableScanDesc scan;
2827 : : double reltuples;
2828 : : IndexInfo *indexInfo;
2829 : :
 : : /* NOTE(review): "progress" is currently not referenced in this body. */
2830 : : /* Initialize local tuplesort coordination state */
 : : /* (isWorker=true, nParticipants=-1: this is a worker-side sort state) */
95 michael@paquier.xyz 2831 :GNC 10 : coordinate = palloc0_object(SortCoordinateData);
828 tomas.vondra@postgre 2832 :CBC 10 : coordinate->isWorker = true;
2833 : 10 : coordinate->nParticipants = -1;
2834 : 10 : coordinate->sharedsort = sharedsort;
2835 : :
2836 : : /* Begin "partial" tuplesort */
806 2837 : 10 : state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,
2838 : : TUPLESORT_NONE);
2839 : :
2840 : : /* Join parallel scan */
828 2841 : 10 : indexInfo = BuildIndexInfo(index);
2842 : 10 : indexInfo->ii_Concurrent = brinshared->isconcurrent;
2843 : :
2844 : 10 : scan = table_beginscan_parallel(heap,
2845 : : ParallelTableScanFromBrinShared(brinshared));
2846 : :
2847 : 10 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
2848 : : brinbuildCallbackParallel, state, scan);
2849 : :
2850 : : /* insert the last item */
2851 : 10 : form_and_spill_tuple(state);
2852 : :
2853 : : /* sort the BRIN ranges built by this worker */
806 2854 : 10 : tuplesort_performsort(state->bs_sortstate);
2855 : :
828 2856 : 10 : state->bs_reltuples += reltuples;
2857 : :
2858 : : /*
2859 : : * Done. Record ambuild statistics.
2860 : : */
2861 [ - + ]: 10 : SpinLockAcquire(&brinshared->mutex);
2862 : 10 : brinshared->nparticipantsdone++;
2863 : 10 : brinshared->reltuples += state->bs_reltuples;
2864 : 10 : brinshared->indtuples += state->bs_numtuples;
2865 : 10 : SpinLockRelease(&brinshared->mutex);
2866 : :
2867 : : /* Notify leader */
2868 : 10 : ConditionVariableSignal(&brinshared->workersdonecv);
2869 : :
806 2870 : 10 : tuplesort_end(state->bs_sortstate);
828 2871 : 10 : }
2872 : :
2873 : : /*
2874 : : * Perform work within a launched parallel process.
2875 : : */
2876 : : void
2877 : 6 : _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
2878 : : {
2879 : : char *sharedquery;
2880 : : BrinShared *brinshared;
2881 : : Sharedsort *sharedsort;
2882 : : BrinBuildState *buildstate;
2883 : : Relation heapRel;
2884 : : Relation indexRel;
2885 : : LOCKMODE heapLockmode;
2886 : : LOCKMODE indexLockmode;
2887 : : WalUsage *walusage;
2888 : : BufferUsage *bufferusage;
2889 : : int sortmem;
2890 : :
2891 : : /*
2892 : : * The only possible status flag that can be set to the parallel worker is
2893 : : * PROC_IN_SAFE_IC.
2894 : : */
2895 [ - + - - ]: 6 : Assert((MyProc->statusFlags == 0) ||
2896 : : (MyProc->statusFlags == PROC_IN_SAFE_IC));
2897 : :
2898 : : /* Set debug_query_string for individual workers first */
 : : /* (noError=true: the leader stores the query text only if it had one) */
2899 : 6 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2900 : 6 : debug_query_string = sharedquery;
2901 : :
2902 : : /* Report the query string from leader */
2903 : 6 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
2904 : :
2905 : : /* Look up brin shared state */
2906 : 6 : brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);
2907 : :
2908 : : /* Open relations using lock modes known to be obtained by index.c */
2909 [ + - ]: 6 : if (!brinshared->isconcurrent)
2910 : : {
2911 : 6 : heapLockmode = ShareLock;
2912 : 6 : indexLockmode = AccessExclusiveLock;
2913 : : }
2914 : : else
2915 : : {
828 tomas.vondra@postgre 2916 :UBC 0 : heapLockmode = ShareUpdateExclusiveLock;
2917 : 0 : indexLockmode = RowExclusiveLock;
2918 : : }
2919 : :
2920 : : /* Track query ID */
531 michael@paquier.xyz 2921 :CBC 6 : pgstat_report_query_id(brinshared->queryid, false);
2922 : :
2923 : : /* Open relations within worker */
828 tomas.vondra@postgre 2924 : 6 : heapRel = table_open(brinshared->heaprelid, heapLockmode);
2925 : 6 : indexRel = index_open(brinshared->indexrelid, indexLockmode);
2926 : :
2927 : 6 : buildstate = initialize_brin_buildstate(indexRel, NULL,
2928 : : brinshared->pagesPerRange,
2929 : : InvalidBlockNumber);
2930 : :
2931 : : /* Look up shared state private to tuplesort.c */
2932 : 6 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
2933 : 6 : tuplesort_attach_shared(sharedsort, seg);
2934 : :
2935 : : /* Prepare to track buffer usage during parallel execution */
2936 : 6 : InstrStartParallelQuery();
2937 : :
2938 : : /*
2939 : : * Might as well use reliable figure when doling out maintenance_work_mem
2940 : : * (when requested number of workers were not launched, this will be
2941 : : * somewhat higher than it is for other workers).
2942 : : */
2943 : 6 : sortmem = maintenance_work_mem / brinshared->scantuplesortstates;
2944 : :
806 2945 : 6 : _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
2946 : : heapRel, indexRel, sortmem, false);
2947 : :
2948 : : /* Report WAL/buffer usage during parallel execution */
828 2949 : 6 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2950 : 6 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2951 : 6 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
2952 : 6 : &walusage[ParallelWorkerNumber]);
2953 : :
2954 : 6 : index_close(indexRel, indexLockmode);
2955 : 6 : table_close(heapRel, heapLockmode);
2956 : 6 : }
2957 : :
2958 : : /*
2959 : : * brin_build_empty_tuple
2960 : : * Maybe initialize a BRIN tuple representing empty range.
2961 : : *
2962 : : * Returns a BRIN tuple representing an empty page range starting at the
2963 : : * specified block number. The empty tuple is initialized only once, when it's
2964 : : * needed for the first time, stored in the memory context bs_context to ensure
2965 : : * proper life span, and reused on following calls. All empty tuples are
2966 : : * exactly the same except for the bt_blkno field, which is set to the value
2967 : : * in blkno parameter.
2968 : : */
2969 : : static void
2970 : 10 : brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)
2971 : : {
2972 : : /* First time an empty tuple is requested? If yes, initialize it. */
2973 [ + + ]: 10 : if (state->bs_emptyTuple == NULL)
2974 : : {
2975 : : MemoryContext oldcxt;
2976 : 5 : BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);
2977 : :
2978 : : /* Allocate the tuple in context for the whole index build. */
2979 : 5 : oldcxt = MemoryContextSwitchTo(state->bs_context);
2980 : :
 : : /* This also records the tuple's length in bs_emptyTupleLen. */
2981 : 5 : state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,
2982 : : &state->bs_emptyTupleLen);
2983 : :
2984 : 5 : MemoryContextSwitchTo(oldcxt);
2985 : : }
2986 : : else
2987 : : {
2988 : : /* If we already have an empty tuple, just update the block. */
2989 : 5 : state->bs_emptyTuple->bt_blkno = blkno;
2990 : : }
2991 : 10 : }
2992 : :
2993 : : /*
2994 : : * brin_fill_empty_ranges
2995 : : * Add BRIN index tuples representing empty page ranges.
2996 : : *
2997 : : * prevRange/nextRange determine for which page ranges to add empty summaries.
2998 : : * Both boundaries are exclusive, i.e. only ranges starting at blkno for which
2999 : : * (prevRange < blkno < nextRange) will be added to the index.
3000 : : *
3001 : : * If prevRange is InvalidBlockNumber, this means there was no previous page
3002 : : * range (i.e. the first empty range to add is for blkno=0).
3003 : : *
3004 : : * The empty tuple is built only once, and then reused for all future calls.
3005 : : */
3006 : : static void
3007 : 205 : brin_fill_empty_ranges(BrinBuildState *state,
3008 : : BlockNumber prevRange, BlockNumber nextRange)
3009 : : {
3010 : : BlockNumber blkno;
3011 : :
3012 : : /*
3013 : : * If we already summarized some ranges, we need to start with the next
3014 : : * one. Otherwise start from the first range of the table.
3015 : : */
3016 [ + + ]: 205 : blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);
3017 : :
3018 : : /* Generate empty ranges until we hit the next non-empty range. */
3019 [ + + ]: 215 : while (blkno < nextRange)
3020 : : {
3021 : : /* Did we already build the empty tuple? If not, do it now. */
3022 : 10 : brin_build_empty_tuple(state, blkno)
3023 : :
 : : /* Insert the cached empty tuple (its bt_blkno was just updated). */
3024 : 10 : brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,
3025 : : &state->bs_currentInsertBuf,
828 tomas.vondra@postgre 3026 :GIC 10 : blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);
3027 : :
3028 : : /* try next page range */
828 tomas.vondra@postgre 3029 :CBC 10 : blkno += state->bs_pagesPerRange;
3030 : : }
3031 : 205 : }
|