Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * nbtsort.c
4 : : * Build a btree from sorted input by loading leaf pages sequentially.
5 : : *
6 : : * NOTES
7 : : *
8 : : * We use tuplesort.c to sort the given index tuples into order.
9 : : * Then we scan the index tuples in order and build the btree pages
10 : : * for each level. We load source tuples into leaf-level pages.
11 : : * Whenever we fill a page at one level, we add a link to it to its
12 : : * parent level (starting a new parent level if necessary). When
13 : : * done, we write out each final page on each level, adding it to
14 : : * its parent level. When we have only one page on a level, it must be
15 : : * the root -- it can be attached to the btree metapage and we are done.
16 : : *
17 : : * It is not wise to pack the pages entirely full, since then *any*
18 : : * insertion would cause a split (and not only of the leaf page; the need
19 : : * for a split would cascade right up the tree). The steady-state load
20 : : * factor for btrees is usually estimated at 70%. We choose to pack leaf
21 : : * pages to the user-controllable fill factor (default 90%) while upper pages
22 : : * are always packed to 70%. This gives us reasonable density (there aren't
23 : : * many upper pages if the keys are reasonable-size) without risking a lot of
24 : : * cascading splits during early insertions.
25 : : *
26 : : * We use the bulk smgr loading facility to bypass the buffer cache and
27 : : * WAL-log the pages efficiently.
28 : : *
29 : : * This code isn't concerned about the FSM at all. The caller is responsible
30 : : * for initializing that.
31 : : *
32 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
33 : : * Portions Copyright (c) 1994, Regents of the University of California
34 : : *
35 : : * IDENTIFICATION
36 : : * src/backend/access/nbtree/nbtsort.c
37 : : *
38 : : *-------------------------------------------------------------------------
39 : : */
40 : :
41 : : #include "postgres.h"
42 : :
43 : : #include "access/nbtree.h"
44 : : #include "access/parallel.h"
45 : : #include "access/relscan.h"
46 : : #include "access/table.h"
47 : : #include "access/tableam.h"
48 : : #include "access/xact.h"
49 : : #include "catalog/index.h"
50 : : #include "commands/progress.h"
51 : : #include "executor/instrument.h"
52 : : #include "miscadmin.h"
53 : : #include "pgstat.h"
54 : : #include "storage/bulk_write.h"
55 : : #include "storage/proc.h"
56 : : #include "tcop/tcopprot.h"
57 : : #include "utils/rel.h"
58 : : #include "utils/sortsupport.h"
59 : : #include "utils/tuplesort.h"
60 : : #include "utils/wait_event.h"
61 : :
62 : :
63 : : /* Magic numbers for parallel state sharing */
64 : : #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001)
65 : : #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002)
66 : : #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003)
67 : : #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004)
68 : : #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005)
69 : : #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006)
70 : :
71 : : /*
72 : : * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
73 : : * parallel index builds. This may be useful as a debugging aid.
74 : : */
75 : : /* #define DISABLE_LEADER_PARTICIPATION */
76 : :
77 : : /*
78 : : * Status record for spooling/sorting phase. (Note we may have two of
79 : : * these due to the special requirements for uniqueness-checking with
80 : : * dead tuples.)
81 : : */
82 : : typedef struct BTSpool
83 : : {
84 : : Tuplesortstate *sortstate; /* state data for tuplesort.c */
85 : : Relation heap;
86 : : Relation index;
87 : : bool isunique;
88 : : bool nulls_not_distinct;
89 : : } BTSpool;
90 : :
91 : : /*
92 : : * Status for index builds performed in parallel. This is allocated in a
93 : : * dynamic shared memory segment. Note that there is a separate tuplesort TOC
94 : : * entry, private to tuplesort.c but allocated by this module on its behalf.
95 : : */
96 : : typedef struct BTShared
97 : : {
98 : : /*
99 : : * These fields are not modified during the sort. They primarily exist
100 : : * for the benefit of worker processes that need to create BTSpool state
101 : : * corresponding to that used by the leader.
102 : : */
103 : : Oid heaprelid;
104 : : Oid indexrelid;
105 : : bool isunique;
106 : : bool nulls_not_distinct;
107 : : bool isconcurrent;
108 : : int scantuplesortstates;
109 : :
110 : : /* Query ID, for report in worker processes */
111 : : int64 queryid;
112 : :
113 : : /*
114 : : * workersdonecv is used to monitor the progress of workers. All parallel
115 : : * participants must indicate that they are done before leader can use
116 : : * mutable state that workers maintain during scan (and before leader can
117 : : * proceed to tuplesort_performsort()).
118 : : */
119 : : ConditionVariable workersdonecv;
120 : :
121 : : /*
122 : : * mutex protects all fields before heapdesc.
123 : : *
124 : : * These fields contain status information of interest to B-Tree index
125 : : * builds that must work just the same when an index is built in parallel.
126 : : */
127 : : slock_t mutex;
128 : :
129 : : /*
130 : : * Mutable state that is maintained by workers, and reported back to
131 : : * leader at end of parallel scan.
132 : : *
133 : : * nparticipantsdone is number of worker processes finished.
134 : : *
135 : : * reltuples is the total number of input heap tuples.
136 : : *
137 : : * havedead indicates if RECENTLY_DEAD tuples were encountered during
138 : : * build.
139 : : *
140 : : * indtuples is the total number of tuples that made it into the index.
141 : : *
142 : : * brokenhotchain indicates if any worker detected a broken HOT chain
143 : : * during build.
144 : : */
145 : : int nparticipantsdone;
146 : : double reltuples;
147 : : bool havedead;
148 : : double indtuples;
149 : : bool brokenhotchain;
150 : :
151 : : /*
152 : : * ParallelTableScanDescData data follows. Can't directly embed here, as
153 : : * implementations of the parallel table scan desc interface might need
154 : : * stronger alignment.
155 : : */
156 : : } BTShared;
157 : :
158 : : /*
159 : : * Return pointer to a BTShared's parallel table scan.
160 : : *
161 : : * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
162 : : * MAXALIGN.
163 : : */
164 : : #define ParallelTableScanFromBTShared(shared) \
165 : : (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
166 : :
167 : : /*
168 : : * Status for leader in parallel index build.
169 : : */
170 : : typedef struct BTLeader
171 : : {
172 : : /* parallel context itself */
173 : : ParallelContext *pcxt;
174 : :
175 : : /*
176 : : * nparticipanttuplesorts is the exact number of worker processes
177 : : * successfully launched, plus one leader process if it participates as a
178 : : * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
179 : : * participating as a worker).
180 : : */
181 : : int nparticipanttuplesorts;
182 : :
183 : : /*
184 : : * Leader process convenience pointers to shared state (leader avoids TOC
185 : : * lookups).
186 : : *
187 : : * btshared is the shared state for entire build. sharedsort is the
188 : : * shared, tuplesort-managed state passed to each process tuplesort.
189 : : * sharedsort2 is the corresponding btspool2 shared state, used only when
190 : : * building unique indexes. snapshot is the snapshot used by the scan iff
191 : : * an MVCC snapshot is required.
192 : : */
193 : : BTShared *btshared;
194 : : Sharedsort *sharedsort;
195 : : Sharedsort *sharedsort2;
196 : : Snapshot snapshot;
197 : : WalUsage *walusage;
198 : : BufferUsage *bufferusage;
199 : : } BTLeader;
200 : :
201 : : /*
202 : : * Working state for btbuild and its callback.
203 : : *
204 : : * When parallel CREATE INDEX is used, there is a BTBuildState for each
205 : : * participant.
206 : : */
207 : : typedef struct BTBuildState
208 : : {
209 : : bool isunique;
210 : : bool nulls_not_distinct;
211 : : bool havedead;
212 : : Relation heap;
213 : : BTSpool *spool;
214 : :
215 : : /*
216 : : * spool2 is needed only when the index is a unique index. Dead tuples are
217 : : * put into spool2 instead of spool in order to avoid uniqueness check.
218 : : */
219 : : BTSpool *spool2;
220 : : double indtuples;
221 : :
222 : : /*
223 : : * btleader is only present when a parallel index build is performed, and
224 : : * only in the leader process. (Actually, only the leader has a
225 : : * BTBuildState. Workers have their own spool and spool2, though.)
226 : : */
227 : : BTLeader *btleader;
228 : : } BTBuildState;
229 : :
230 : : /*
231 : : * Status record for a btree page being built. We have one of these
232 : : * for each active tree level.
233 : : */
234 : : typedef struct BTPageState
235 : : {
236 : : BulkWriteBuffer btps_buf; /* workspace for page building */
237 : : BlockNumber btps_blkno; /* block # to write this page at */
238 : : IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */
239 : : OffsetNumber btps_lastoff; /* last item offset loaded */
240 : : Size btps_lastextra; /* last item's extra posting list space */
241 : : uint32 btps_level; /* tree level (0 = leaf) */
242 : : Size btps_full; /* "full" if less than this much free space */
243 : : struct BTPageState *btps_next; /* link to parent level, if any */
244 : : } BTPageState;
245 : :
246 : : /*
247 : : * Overall status record for index writing phase.
248 : : */
249 : : typedef struct BTWriteState
250 : : {
251 : : Relation heap;
252 : : Relation index;
253 : : BulkWriteState *bulkstate;
254 : : BTScanInsert inskey; /* generic insertion scankey */
255 : : BlockNumber btws_pages_alloced; /* # pages allocated */
256 : : } BTWriteState;
257 : :
258 : :
259 : : static double _bt_spools_heapscan(Relation heap, Relation index,
260 : : BTBuildState *buildstate, IndexInfo *indexInfo);
261 : : static void _bt_spooldestroy(BTSpool *btspool);
262 : : static void _bt_spool(BTSpool *btspool, const ItemPointerData *self,
263 : : const Datum *values, const bool *isnull);
264 : : static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
265 : : static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
266 : : bool *isnull, bool tupleIsAlive, void *state);
267 : : static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
268 : : static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
269 : : static void _bt_slideleft(Page rightmostpage);
270 : : static void _bt_sortaddtup(Page page, Size itemsize,
271 : : const IndexTupleData *itup, OffsetNumber itup_off,
272 : : bool newfirstdataitem);
273 : : static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
274 : : IndexTuple itup, Size truncextra);
275 : : static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
276 : : BTPageState *state,
277 : : BTDedupState dstate);
278 : : static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
279 : : static void _bt_load(BTWriteState *wstate,
280 : : BTSpool *btspool, BTSpool *btspool2);
281 : : static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
282 : : int request);
283 : : static void _bt_end_parallel(BTLeader *btleader);
284 : : static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot);
285 : : static double _bt_parallel_heapscan(BTBuildState *buildstate,
286 : : bool *brokenhotchain);
287 : : static void _bt_leader_participate_as_worker(BTBuildState *buildstate);
288 : : static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
289 : : BTShared *btshared, Sharedsort *sharedsort,
290 : : Sharedsort *sharedsort2, int sortmem,
291 : : bool progress);
292 : :
293 : :
294 : : /*
295 : : * btbuild() -- build a new btree index.
296 : : */
297 : : IndexBuildResult *
2963 rhaas@postgresql.org 298 :CBC 26245 : btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
299 : : {
300 : : IndexBuildResult *result;
301 : : BTBuildState buildstate;
302 : : double reltuples;
303 : :
304 : : #ifdef BTREE_BUILD_STATS
305 : : if (log_btree_build_stats)
306 : : ResetUsage();
307 : : #endif /* BTREE_BUILD_STATS */
308 : :
309 : 26245 : buildstate.isunique = indexInfo->ii_Unique;
1501 peter@eisentraut.org 310 : 26245 : buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
2963 rhaas@postgresql.org 311 : 26245 : buildstate.havedead = false;
312 : 26245 : buildstate.heap = heap;
313 : 26245 : buildstate.spool = NULL;
314 : 26245 : buildstate.spool2 = NULL;
315 : 26245 : buildstate.indtuples = 0;
316 : 26245 : buildstate.btleader = NULL;
317 : :
318 : : /*
319 : : * We expect to be called exactly once for any index relation. If that's
320 : : * not the case, big trouble's what we have.
321 : : */
322 [ - + ]: 26245 : if (RelationGetNumberOfBlocks(index) != 0)
2963 rhaas@postgresql.org 323 [ # # ]:UBC 0 : elog(ERROR, "index \"%s\" already contains data",
324 : : RelationGetRelationName(index));
325 : :
2963 rhaas@postgresql.org 326 :CBC 26245 : reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
327 : :
328 : : /*
329 : : * Finish the build by (1) completing the sort of the spool file, (2)
330 : : * inserting the sorted tuples into btree pages and (3) building the upper
331 : : * levels. Finally, it may also be necessary to end use of parallelism.
332 : : */
333 : 26239 : _bt_leafbuild(buildstate.spool, buildstate.spool2);
334 : 26197 : _bt_spooldestroy(buildstate.spool);
335 [ + + ]: 26197 : if (buildstate.spool2)
336 : 14 : _bt_spooldestroy(buildstate.spool2);
337 [ + + ]: 26197 : if (buildstate.btleader)
338 : 82 : _bt_end_parallel(buildstate.btleader);
339 : :
95 michael@paquier.xyz 340 :GNC 26197 : result = palloc_object(IndexBuildResult);
341 : :
2963 rhaas@postgresql.org 342 :CBC 26197 : result->heap_tuples = reltuples;
343 : 26197 : result->index_tuples = buildstate.indtuples;
344 : :
345 : : #ifdef BTREE_BUILD_STATS
346 : : if (log_btree_build_stats)
347 : : {
348 : : ShowUsage("BTREE BUILD STATS");
349 : : ResetUsage();
350 : : }
351 : : #endif /* BTREE_BUILD_STATS */
352 : :
353 : 26197 : return result;
354 : : }
355 : :
356 : : /*
357 : : * Create and initialize one or two spool structures, and save them in caller's
358 : : * buildstate argument. May also fill-in fields within indexInfo used by index
359 : : * builds.
360 : : *
361 : : * Scans the heap, possibly in parallel, filling spools with IndexTuples. This
362 : : * routine encapsulates all aspects of managing parallelism. Caller need only
363 : : * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
364 : : *
365 : : * Returns the total number of heap tuples scanned.
366 : : */
367 : : static double
368 : 26245 : _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
369 : : IndexInfo *indexInfo)
370 : : {
95 michael@paquier.xyz 371 :GNC 26245 : BTSpool *btspool = palloc0_object(BTSpool);
2963 rhaas@postgresql.org 372 :CBC 26245 : SortCoordinate coordinate = NULL;
373 : 26245 : double reltuples = 0;
374 : :
375 : : /*
376 : : * We size the sort area as maintenance_work_mem rather than work_mem to
377 : : * speed index creation. This should be OK since a single backend can't
378 : : * run multiple index creations in parallel (see also: notes on
379 : : * parallelism and maintenance_work_mem below).
380 : : */
4793 tgl@sss.pgh.pa.us 381 : 26245 : btspool->heap = heap;
9646 382 : 26245 : btspool->index = index;
2963 rhaas@postgresql.org 383 : 26245 : btspool->isunique = indexInfo->ii_Unique;
1501 peter@eisentraut.org 384 : 26245 : btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
385 : :
386 : : /* Save as primary spool */
2963 rhaas@postgresql.org 387 : 26245 : buildstate->spool = btspool;
388 : :
389 : : /* Report table scan phase started */
2539 alvherre@alvh.no-ip. 390 : 26245 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
391 : : PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN);
392 : :
393 : : /* Attempt to launch parallel worker scan when required */
2963 rhaas@postgresql.org 394 [ + + ]: 26245 : if (indexInfo->ii_ParallelWorkers > 0)
395 : 82 : _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent,
396 : : indexInfo->ii_ParallelWorkers);
397 : :
398 : : /*
399 : : * If parallel build requested and at least one worker process was
400 : : * successfully launched, set up coordination state
401 : : */
402 [ + + ]: 26245 : if (buildstate->btleader)
403 : : {
95 michael@paquier.xyz 404 :GNC 82 : coordinate = palloc0_object(SortCoordinateData);
2963 rhaas@postgresql.org 405 :CBC 82 : coordinate->isWorker = false;
406 : 82 : coordinate->nParticipants =
407 : 82 : buildstate->btleader->nparticipanttuplesorts;
408 : 82 : coordinate->sharedsort = buildstate->btleader->sharedsort;
409 : : }
410 : :
411 : : /*
412 : : * Begin serial/leader tuplesort.
413 : : *
414 : : * In cases where parallelism is involved, the leader receives the same
415 : : * share of maintenance_work_mem as a serial sort (it is generally treated
416 : : * in the same way as a serial sort once we return). Parallel worker
417 : : * Tuplesortstates will have received only a fraction of
418 : : * maintenance_work_mem, though.
419 : : *
420 : : * We rely on the lifetime of the Leader Tuplesortstate almost not
421 : : * overlapping with any worker Tuplesortstate's lifetime. There may be
422 : : * some small overlap, but that's okay because we rely on leader
423 : : * Tuplesortstate only allocating a small, fixed amount of memory here.
424 : : * When its tuplesort_performsort() is called (by our caller), and
425 : : * significant amounts of memory are likely to be used, all workers must
426 : : * have already freed almost all memory held by their Tuplesortstates
427 : : * (they are about to go away completely, too). The overall effect is
428 : : * that maintenance_work_mem always represents an absolute high watermark
429 : : * on the amount of memory used by a CREATE INDEX operation, regardless of
430 : : * the use of parallelism or any other factor.
431 : : */
432 : 52490 : buildstate->spool->sortstate =
433 : 26245 : tuplesort_begin_index_btree(heap, index, buildstate->isunique,
1501 peter@eisentraut.org 434 : 26245 : buildstate->nulls_not_distinct,
435 : : maintenance_work_mem, coordinate,
436 : : TUPLESORT_NONE);
437 : :
438 : : /*
439 : : * If building a unique index, put dead tuples in a second spool to keep
440 : : * them out of the uniqueness check. We expect that the second spool (for
441 : : * dead tuples) won't get very full, so we give it only work_mem.
442 : : */
2963 rhaas@postgresql.org 443 [ + + ]: 26245 : if (indexInfo->ii_Unique)
444 : : {
95 michael@paquier.xyz 445 :GNC 21246 : BTSpool *btspool2 = palloc0_object(BTSpool);
2963 rhaas@postgresql.org 446 :CBC 21246 : SortCoordinate coordinate2 = NULL;
447 : :
448 : : /* Initialize secondary spool */
449 : 21246 : btspool2->heap = heap;
450 : 21246 : btspool2->index = index;
451 : 21246 : btspool2->isunique = false;
452 : : /* Save as secondary spool */
453 : 21246 : buildstate->spool2 = btspool2;
454 : :
455 [ + + ]: 21246 : if (buildstate->btleader)
456 : : {
457 : : /*
458 : : * Set up non-private state that is passed to
459 : : * tuplesort_begin_index_btree() about the basic high level
460 : : * coordination of a parallel sort.
461 : : */
95 michael@paquier.xyz 462 :GNC 37 : coordinate2 = palloc0_object(SortCoordinateData);
2963 rhaas@postgresql.org 463 :CBC 37 : coordinate2->isWorker = false;
464 : 37 : coordinate2->nParticipants =
465 : 37 : buildstate->btleader->nparticipanttuplesorts;
466 : 37 : coordinate2->sharedsort = buildstate->btleader->sharedsort2;
467 : : }
468 : :
469 : : /*
470 : : * We expect that the second one (for dead tuples) won't get very
471 : : * full, so we give it only work_mem
472 : : */
473 : 21246 : buildstate->spool2->sortstate =
1501 peter@eisentraut.org 474 : 21246 : tuplesort_begin_index_btree(heap, index, false, false, work_mem,
475 : : coordinate2, TUPLESORT_NONE);
476 : : }
477 : :
478 : : /* Fill spool using either serial or parallel heap scan */
2963 rhaas@postgresql.org 479 [ + + ]: 26245 : if (!buildstate->btleader)
2539 alvherre@alvh.no-ip. 480 : 26163 : reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
481 : : _bt_build_callback, buildstate,
482 : : NULL);
483 : : else
2963 rhaas@postgresql.org 484 : 82 : reltuples = _bt_parallel_heapscan(buildstate,
485 : : &indexInfo->ii_BrokenHotChain);
486 : :
487 : : /*
488 : : * Set the progress target for the next phase. Reset the block number
489 : : * values set by table_index_build_scan
490 : : */
491 : : {
1867 peter@eisentraut.org 492 : 26239 : const int progress_index[] = {
493 : : PROGRESS_CREATEIDX_TUPLES_TOTAL,
494 : : PROGRESS_SCAN_BLOCKS_TOTAL,
495 : : PROGRESS_SCAN_BLOCKS_DONE
496 : : };
497 : 26239 : const int64 progress_vals[] = {
2539 alvherre@alvh.no-ip. 498 : 26239 : buildstate->indtuples,
499 : : 0, 0
500 : : };
501 : :
1867 peter@eisentraut.org 502 : 26239 : pgstat_progress_update_multi_param(3, progress_index, progress_vals);
503 : : }
504 : :
505 : : /* okay, all heap tuples are spooled */
2963 rhaas@postgresql.org 506 [ + + + + ]: 26239 : if (buildstate->spool2 && !buildstate->havedead)
507 : : {
508 : : /* spool2 turns out to be unnecessary */
509 : 21232 : _bt_spooldestroy(buildstate->spool2);
510 : 21232 : buildstate->spool2 = NULL;
511 : : }
512 : :
513 : 26239 : return reltuples;
514 : : }
515 : :
516 : : /*
517 : : * clean up a spool structure and its substructures.
518 : : */
519 : : static void
9646 tgl@sss.pgh.pa.us 520 : 47443 : _bt_spooldestroy(BTSpool *btspool)
521 : : {
522 : 47443 : tuplesort_end(btspool->sortstate);
7839 neilc@samurai.com 523 : 47443 : pfree(btspool);
10841 scrappy@hub.org 524 : 47443 : }
525 : :
526 : : /*
527 : : * spool an index entry into the sort file.
528 : : */
529 : : static void
135 peter@eisentraut.org 530 :GNC 6703633 : _bt_spool(BTSpool *btspool, const ItemPointerData *self, const Datum *values, const bool *isnull)
531 : : {
4275 rhaas@postgresql.org 532 :CBC 6703633 : tuplesort_putindextuplevalues(btspool->sortstate, btspool->index,
533 : : self, values, isnull);
10841 scrappy@hub.org 534 : 6703633 : }
535 : :
536 : : /*
537 : : * given a spool loaded by successive calls to _bt_spool,
538 : : * create an entire btree.
539 : : */
540 : : static void
9348 inoue@tpf.co.jp 541 : 26239 : _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
542 : : {
543 : : BTWriteState wstate;
544 : :
545 : : #ifdef BTREE_BUILD_STATS
546 : : if (log_btree_build_stats)
547 : : {
548 : : ShowUsage("BTREE BUILD (Spool) STATISTICS");
549 : : ResetUsage();
550 : : }
551 : : #endif /* BTREE_BUILD_STATS */
552 : :
553 : : /* Execute the sort */
2539 alvherre@alvh.no-ip. 554 : 26239 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
555 : : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
8422 tgl@sss.pgh.pa.us 556 : 26239 : tuplesort_performsort(btspool->sortstate);
9348 inoue@tpf.co.jp 557 [ + + ]: 26197 : if (btspool2)
558 : : {
2539 alvherre@alvh.no-ip. 559 : 14 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
560 : : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
9348 inoue@tpf.co.jp 561 : 14 : tuplesort_performsort(btspool2->sortstate);
562 : : }
563 : :
4793 tgl@sss.pgh.pa.us 564 : 26197 : wstate.heap = btspool->heap;
7956 565 : 26197 : wstate.index = btspool->index;
1009 pg@bowt.ie 566 : 26197 : wstate.inskey = _bt_mkscankey(wstate.index, NULL);
567 : : /* _bt_mkscankey() won't set allequalimage without metapage */
2209 568 : 26197 : wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
569 : :
570 : : /* reserve the metapage */
7956 tgl@sss.pgh.pa.us 571 : 26197 : wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
572 : :
2539 alvherre@alvh.no-ip. 573 : 26197 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
574 : : PROGRESS_BTREE_PHASE_LEAF_LOAD);
7956 tgl@sss.pgh.pa.us 575 : 26197 : _bt_load(&wstate, btspool, btspool2);
10841 scrappy@hub.org 576 : 26197 : }
577 : :
578 : : /*
579 : : * Per-tuple callback for table_index_build_scan
580 : : */
581 : : static void
2963 rhaas@postgresql.org 582 : 6703633 : _bt_build_callback(Relation index,
583 : : ItemPointer tid,
584 : : Datum *values,
585 : : bool *isnull,
586 : : bool tupleIsAlive,
587 : : void *state)
588 : : {
589 : 6703633 : BTBuildState *buildstate = (BTBuildState *) state;
590 : :
591 : : /*
592 : : * insert the index tuple into the appropriate spool file for subsequent
593 : : * processing
594 : : */
595 [ + + + + ]: 6703633 : if (tupleIsAlive || buildstate->spool2 == NULL)
2319 andres@anarazel.de 596 : 6703379 : _bt_spool(buildstate->spool, tid, values, isnull);
597 : : else
598 : : {
599 : : /* dead tuples are put into spool2 */
2963 rhaas@postgresql.org 600 : 254 : buildstate->havedead = true;
2319 andres@anarazel.de 601 : 254 : _bt_spool(buildstate->spool2, tid, values, isnull);
602 : : }
603 : :
2963 rhaas@postgresql.org 604 : 6703633 : buildstate->indtuples += 1;
605 : 6703633 : }
606 : :
607 : : /*
608 : : * allocate workspace for a new, clean btree page, not linked to any siblings.
609 : : */
610 : : static BulkWriteBuffer
751 heikki.linnakangas@i 611 : 27598 : _bt_blnewpage(BTWriteState *wstate, uint32 level)
612 : : {
613 : : BulkWriteBuffer buf;
614 : : Page page;
615 : : BTPageOpaque opaque;
616 : :
617 : 27598 : buf = smgr_bulk_get_buf(wstate->bulkstate);
618 : 27598 : page = (Page) buf;
619 : :
620 : : /* Zero the page and set up standard page header info */
7956 tgl@sss.pgh.pa.us 621 : 27598 : _bt_pageinit(page, BLCKSZ);
622 : :
623 : : /* Initialize BT opaque state */
1444 michael@paquier.xyz 624 : 27598 : opaque = BTPageGetOpaque(page);
10416 bruce@momjian.us 625 : 27598 : opaque->btpo_prev = opaque->btpo_next = P_NONE;
1845 pg@bowt.ie 626 : 27598 : opaque->btpo_level = level;
8423 tgl@sss.pgh.pa.us 627 : 27598 : opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
7251 628 : 27598 : opaque->btpo_cycleid = 0;
629 : :
630 : : /* Make the P_HIKEY line pointer appear allocated */
7956 631 : 27598 : ((PageHeader) page)->pd_lower += sizeof(ItemIdData);
632 : :
751 heikki.linnakangas@i 633 : 27598 : return buf;
634 : : }
635 : :
636 : : /*
637 : : * emit a completed btree page, and release the working storage.
638 : : */
639 : : static void
640 : 53795 : _bt_blwritepage(BTWriteState *wstate, BulkWriteBuffer buf, BlockNumber blkno)
641 : : {
642 : 53795 : smgr_bulk_write(wstate->bulkstate, blkno, buf, true);
643 : : /* smgr_bulk_write took ownership of 'buf' */
8423 tgl@sss.pgh.pa.us 644 : 53795 : }
645 : :
646 : : /*
647 : : * allocate and initialize a new BTPageState. the returned structure
648 : : * is suitable for immediate use by _bt_buildadd.
649 : : */
650 : : static BTPageState *
7956 651 : 6904 : _bt_pagestate(BTWriteState *wstate, uint32 level)
652 : : {
95 michael@paquier.xyz 653 :GNC 6904 : BTPageState *state = palloc0_object(BTPageState);
654 : :
655 : : /* create initial page for level */
751 heikki.linnakangas@i 656 :CBC 6904 : state->btps_buf = _bt_blnewpage(wstate, level);
657 : :
658 : : /* and assign it a page position */
7956 tgl@sss.pgh.pa.us 659 : 6904 : state->btps_blkno = wstate->btws_pages_alloced++;
660 : :
2320 pg@bowt.ie 661 : 6904 : state->btps_lowkey = NULL;
662 : : /* initialize lastoff so first item goes into P_FIRSTKEY */
9368 tgl@sss.pgh.pa.us 663 : 6904 : state->btps_lastoff = P_HIKEY;
2209 pg@bowt.ie 664 : 6904 : state->btps_lastextra = 0;
9368 tgl@sss.pgh.pa.us 665 : 6904 : state->btps_level = level;
666 : : /* set "full" threshold based on level. See notes at head of file. */
7195 667 [ + + ]: 6904 : if (level > 0)
7187 668 : 1370 : state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
669 : : else
2302 michael@paquier.xyz 670 [ + - - + : 5534 : state->btps_full = BTGetTargetPageFreeSpace(wstate->index);
+ + ]
671 : :
672 : : /* no parent level, yet */
8103 neilc@samurai.com 673 : 6904 : state->btps_next = NULL;
674 : :
9368 tgl@sss.pgh.pa.us 675 : 6904 : return state;
676 : : }
677 : :
678 : : /*
679 : : * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
680 : : * P_HIKEY, overwriting P_HIKEY).
681 : : *
682 : : * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
683 : : * rightmost page on its level is not supposed to get a high key. Now that
684 : : * it's clear that this page is a rightmost page, remove the unneeded empty
685 : : * P_HIKEY line pointer space.
686 : : */
687 : : static void
2078 pg@bowt.ie 688 : 6904 : _bt_slideleft(Page rightmostpage)
689 : : {
690 : : OffsetNumber off;
691 : : OffsetNumber maxoff;
692 : : ItemId previi;
693 : :
694 : 6904 : maxoff = PageGetMaxOffsetNumber(rightmostpage);
695 [ - + ]: 6904 : Assert(maxoff >= P_FIRSTKEY);
696 : 6904 : previi = PageGetItemId(rightmostpage, P_HIKEY);
697 [ + + ]: 400875 : for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
698 : : {
699 : 393971 : ItemId thisii = PageGetItemId(rightmostpage, off);
700 : :
701 : 393971 : *previi = *thisii;
702 : 393971 : previi = thisii;
703 : : }
704 : 6904 : ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
10841 scrappy@hub.org 705 : 6904 : }
706 : :
707 : : /*
708 : : * Add an item to a page being built.
709 : : *
710 : : * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
711 : : * raises an error directly.
712 : : *
713 : : * Note that our nbtsort.c caller does not know yet if the page will be
714 : : * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by
715 : : * caller. Page that turns out to be the rightmost on its level is fixed by
716 : : * calling _bt_slideleft().
717 : : */
718 : : static void
9368 tgl@sss.pgh.pa.us 719 : 5958232 : _bt_sortaddtup(Page page,
720 : : Size itemsize,
721 : : const IndexTupleData *itup,
722 : : OffsetNumber itup_off,
723 : : bool newfirstdataitem)
724 : : {
725 : : IndexTupleData trunctuple;
726 : :
2162 pg@bowt.ie 727 [ + + ]: 5958232 : if (newfirstdataitem)
728 : : {
7354 tgl@sss.pgh.pa.us 729 : 1433 : trunctuple = *itup;
730 : 1433 : trunctuple.t_info = sizeof(IndexTupleData);
2168 pg@bowt.ie 731 : 1433 : BTreeTupleSetNAtts(&trunctuple, 0, false);
7354 tgl@sss.pgh.pa.us 732 : 1433 : itup = &trunctuple;
733 : 1433 : itemsize = sizeof(IndexTupleData);
734 : : }
735 : :
139 peter@eisentraut.org 736 [ - + ]:GNC 5958232 : if (PageAddItem(page, itup, itemsize, itup_off, false, false) == InvalidOffsetNumber)
8273 tgl@sss.pgh.pa.us 737 [ # # ]:UBC 0 : elog(ERROR, "failed to add item to the index page");
10623 scrappy@hub.org 738 :CBC 5958232 : }
739 : :
740 : : /*----------
741 : : * Add an item to a disk page from the sort output (or add a posting list
742 : : * item formed from the sort output).
743 : : *
744 : : * We must be careful to observe the page layout conventions of nbtsearch.c:
745 : : * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
746 : : * - on non-leaf pages, the key portion of the first item need not be
747 : : * stored, we should store only the link.
748 : : *
749 : : * A leaf page being built looks like:
750 : : *
751 : : * +----------------+---------------------------------+
752 : : * | PageHeaderData | linp0 linp1 linp2 ... |
753 : : * +-----------+----+---------------------------------+
754 : : * | ... linpN | |
755 : : * +-----------+--------------------------------------+
756 : : * | ^ last |
757 : : * | |
758 : : * +-------------+------------------------------------+
759 : : * | | itemN ... |
760 : : * +-------------+------------------+-----------------+
761 : : * | ... item3 item2 item1 | "special space" |
762 : : * +--------------------------------+-----------------+
763 : : *
764 : : * Contrast this with the diagram in bufpage.h; note the mismatch
765 : : * between linps and items. This is because we reserve linp0 as a
766 : : * placeholder for the pointer to the "high key" item; when we have
767 : : * filled up the page, we will set linp0 to point to itemN and clear
768 : : * linpN. On the other hand, if we find this is the last (rightmost)
769 : : * page, we leave the items alone and slide the linp array over. If
770 : : * the high key is to be truncated, offset 1 is deleted, and we insert
771 : : * the truncated high key at offset 1.
772 : : *
773 : : * 'last' pointer indicates the last offset added to the page.
774 : : *
775 : : * 'truncextra' is the size of the posting list in itup, if any. This
776 : : * information is stashed for the next call here, when we may benefit
777 : : * from considering the impact of truncating away the posting list on
778 : : * the page before deciding to finish the page off. Posting lists are
779 : : * often relatively large, so it is worth going to the trouble of
780 : : * accounting for the saving from truncating away the posting list of
781 : : * the tuple that becomes the high key (that may be the only way to
782 : : * get close to target free space on the page). Note that this is
783 : : * only used for the soft fillfactor-wise limit, not the critical hard
784 : : * limit.
785 : : *----------
786 : : */
787 : : static void
2209 pg@bowt.ie 788 : 5937538 : _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
789 : : Size truncextra)
790 : : {
791 : : BulkWriteBuffer nbuf;
792 : : Page npage;
793 : : BlockNumber nblkno;
794 : : OffsetNumber last_off;
795 : : Size last_truncextra;
796 : : Size pgspc;
797 : : Size itupsz;
798 : : bool isleaf;
799 : :
800 : : /*
801 : : * This is a handy place to check for cancel interrupts during the btree
802 : : * load phase of index creation.
803 : : */
7310 tgl@sss.pgh.pa.us 804 [ + + ]: 5937538 : CHECK_FOR_INTERRUPTS();
805 : :
 : : /* Work with local copies of this level's build state; written back at end */
751 heikki.linnakangas@i 806 : 5937538 : nbuf = state->btps_buf;
807 : 5937538 : npage = (Page) nbuf;
7956 tgl@sss.pgh.pa.us 808 : 5937538 : nblkno = state->btps_blkno;
10416 bruce@momjian.us 809 : 5937538 : last_off = state->btps_lastoff;
2209 pg@bowt.ie 810 : 5937538 : last_truncextra = state->btps_lastextra;
811 : 5937538 : state->btps_lastextra = truncextra;
812 : :
10416 bruce@momjian.us 813 : 5937538 : pgspc = PageGetFreeSpace(npage);
2937 tgl@sss.pgh.pa.us 814 : 5937538 : itupsz = IndexTupleSize(itup);
7354 815 : 5937538 : itupsz = MAXALIGN(itupsz);
816 : : /* Leaf case has slightly different rules due to suffix truncation */
2509 pg@bowt.ie 817 : 5937538 : isleaf = (state->btps_level == 0);
818 : :
819 : : /*
820 : : * Check whether the new item can fit on a btree page on current level at
821 : : * all.
822 : : *
823 : : * Every newly built index will treat heap TID as part of the keyspace,
824 : : * which imposes the requirement that new high keys must occasionally have
825 : : * a heap TID appended within _bt_truncate(). That may leave a new pivot
826 : : * tuple one or two MAXALIGN() quantums larger than the original
827 : : * firstright tuple it's derived from. v4 deals with the problem by
828 : : * decreasing the limit on the size of tuples inserted on the leaf level
829 : : * by the same small amount. Enforce the new v4+ limit on the leaf level,
830 : : * and the old limit on internal levels, since pivot tuples may need to
831 : : * make use of the reserved space. This should never fail on internal
832 : : * pages.
833 : : */
369 834 [ + + ]: 5937538 : if (unlikely(itupsz > BTMaxItemSize))
2509 835 : 132 : _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage,
836 : : itup);
837 : :
838 : : /*
839 : : * Check to see if current page will fit new item, with space left over to
840 : : * append a heap TID during suffix truncation when page is a leaf page.
841 : : *
842 : : * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
843 : : * high key with heap TID when finishing off a leaf page, since we rely on
844 : : * _bt_check_third_page() rejecting oversized non-pivot tuples. On
845 : : * internal pages we can always fit 3 pivot tuples with larger internal
846 : : * page tuple limit (includes page high key).
847 : : *
848 : : * Most of the time, a page is only "full" in the sense that the soft
849 : : * fillfactor-wise limit has been exceeded. However, we must always leave
850 : : * at least two items plus a high key on each page before starting a new
851 : : * page. Disregard fillfactor and insert on "full" current page if we
852 : : * don't have the minimum number of items yet. (Note that we deliberately
853 : : * assume that suffix truncation neither enlarges nor shrinks new high key
854 : : * when applying soft limit, except when last tuple has a posting list.)
855 : : */
2175 856 [ + + - + ]: 5937538 : Assert(last_truncextra == 0 || isleaf);
2509 857 [ + + + + ]: 5937538 : if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) ||
2209 858 [ + + + - ]: 5936876 : (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
859 : : {
860 : : /*
861 : : * Finish off the page and write it out.
862 : : */
751 heikki.linnakangas@i 863 : 20694 : BulkWriteBuffer obuf = nbuf;
10415 bruce@momjian.us 864 : 20694 : Page opage = npage;
7868 865 : 20694 : BlockNumber oblkno = nblkno;
866 : : ItemId ii;
867 : : ItemId hii;
868 : : IndexTuple oitup;
869 : :
870 : : /* Create new page of same level */
751 heikki.linnakangas@i 871 : 20694 : nbuf = _bt_blnewpage(wstate, state->btps_level);
872 : 20694 : npage = (Page) nbuf;
873 : :
874 : : /* and assign it a page position */
7956 tgl@sss.pgh.pa.us 875 : 20694 : nblkno = wstate->btws_pages_alloced++;
876 : :
877 : : /*
878 : : * We copy the last item on the page into the new page, and then
879 : : * rearrange the old page so that the 'last item' becomes its high key
880 : : * rather than a true data item. There had better be at least two
881 : : * items on the page already, else the page would be empty of useful
882 : : * data.
883 : : */
9368 884 [ - + ]: 20694 : Assert(last_off > P_FIRSTKEY);
885 : 20694 : ii = PageGetItemId(opage, last_off);
7354 886 : 20694 : oitup = (IndexTuple) PageGetItem(opage, ii);
2162 pg@bowt.ie 887 : 20694 : _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY,
888 : 20694 : !isleaf);
889 : :
890 : : /*
891 : : * Move 'last' into the high key position on opage. _bt_blnewpage()
892 : : * allocated empty space for a line pointer when opage was first
893 : : * created, so this is a matter of rearranging already-allocated space
894 : : * on page, and initializing high key line pointer. (Actually, leaf
895 : : * pages must also swap oitup with a truncated version of oitup, which
896 : : * is sometimes larger than oitup, though never by more than the space
897 : : * needed to append a heap TID.)
898 : : */
10416 bruce@momjian.us 899 : 20694 : hii = PageGetItemId(opage, P_HIKEY);
900 : 20694 : *hii = *ii;
6759 tgl@sss.pgh.pa.us 901 : 20694 : ItemIdSetUnused(ii); /* redundant */
10416 bruce@momjian.us 902 : 20694 : ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);
903 : :
2509 pg@bowt.ie 904 [ + + ]: 20694 : if (isleaf)
905 : : {
906 : : IndexTuple lastleft;
907 : : IndexTuple truncated;
908 : :
909 : : /*
910 : : * Truncate away any unneeded attributes from high key on leaf
911 : : * level. This is only done at the leaf level because downlinks
912 : : * in internal pages are either negative infinity items, or get
913 : : * their contents from copying from one level down. See also:
914 : : * _bt_split().
915 : : *
916 : : * We don't try to bias our choice of split point to make it more
917 : : * likely that _bt_truncate() can truncate away more attributes,
918 : : * whereas the split point used within _bt_split() is chosen much
919 : : * more delicately. Even still, the lastleft and firstright
920 : : * tuples passed to _bt_truncate() here are at least not fully
921 : : * equal to each other when deduplication is used, unless there is
922 : : * a large group of duplicates (also, unique index builds usually
923 : : * have few or no spool2 duplicates). When the split point is
924 : : * between two unequal tuples, _bt_truncate() will avoid including
925 : : * a heap TID in the new high key, which is the most important
926 : : * benefit of suffix truncation.
927 : : *
928 : : * Overwrite the old item with new truncated high key directly.
929 : : * oitup is already located at the physical beginning of tuple
930 : : * space, so this should directly reuse the existing tuple space.
931 : : */
2552 932 : 20631 : ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
933 : 20631 : lastleft = (IndexTuple) PageGetItem(opage, ii);
934 : :
2175 935 [ - + ]: 20631 : Assert(IndexTupleSize(oitup) > last_truncextra);
2552 936 : 20631 : truncated = _bt_truncate(wstate->index, lastleft, oitup,
937 : : wstate->inskey);
139 peter@eisentraut.org 938 [ - + ]:GNC 20631 : if (!PageIndexTupleOverwrite(opage, P_HIKEY, truncated, IndexTupleSize(truncated)))
2406 pg@bowt.ie 939 [ # # ]:UBC 0 : elog(ERROR, "failed to add high key to the index page");
2887 teodor@sigaev.ru 940 :CBC 20631 : pfree(truncated);
941 : :
942 : : /* oitup should continue to point to the page's high key */
943 : 20631 : hii = PageGetItemId(opage, P_HIKEY);
944 : 20631 : oitup = (IndexTuple) PageGetItem(opage, hii);
945 : : }
946 : :
947 : : /*
948 : : * Link the old page into its parent, using its low key. If we don't
949 : : * have a parent, we have to create one; this adds a new btree level.
950 : : */
8103 neilc@samurai.com 951 [ + + ]: 20694 : if (state->btps_next == NULL)
7956 tgl@sss.pgh.pa.us 952 : 1370 : state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
953 : :
2320 pg@bowt.ie 954 [ + - + - : 20694 : Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <=
+ - + + -
- - + ]
955 : : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
956 : : BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) ||
957 : : P_LEFTMOST(BTPageGetOpaque(opage)));
958 [ + - + + : 20694 : Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 ||
- + ]
959 : : !P_LEFTMOST(BTPageGetOpaque(opage)));
2281 960 : 20694 : BTreeTupleSetDownLink(state->btps_lowkey, oblkno);
2209 961 : 20694 : _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0);
2320 962 : 20694 : pfree(state->btps_lowkey);
963 : :
964 : : /*
965 : : * Save a copy of the high key from the old page. It is also the low
966 : : * key for the new page.
967 : : */
968 : 20694 : state->btps_lowkey = CopyIndexTuple(oitup);
969 : :
970 : : /*
971 : : * Set the sibling links for both pages.
972 : : */
973 : : {
1444 michael@paquier.xyz 974 : 20694 : BTPageOpaque oopaque = BTPageGetOpaque(opage);
975 : 20694 : BTPageOpaque nopaque = BTPageGetOpaque(npage);
976 : :
7956 tgl@sss.pgh.pa.us 977 : 20694 : oopaque->btpo_next = nblkno;
978 : 20694 : nopaque->btpo_prev = oblkno;
3189 979 : 20694 : nopaque->btpo_next = P_NONE; /* redundant */
980 : : }
981 : :
982 : : /*
983 : : * Write out the old page. _bt_blwritepage takes ownership of the
984 : : * 'opage' buffer.
985 : : */
751 heikki.linnakangas@i 986 : 20694 : _bt_blwritepage(wstate, obuf, oblkno);
987 : :
988 : : /*
989 : : * Reset last_off to point to new page
990 : : */
9368 tgl@sss.pgh.pa.us 991 : 20694 : last_off = P_FIRSTKEY;
992 : : }
993 : :
994 : : /*
995 : : * By here, either original page is still the current page, or a new page
996 : : * was created that became the current page. Either way, the current page
997 : : * definitely has space for new item.
998 : : *
999 : : * If the new item is the first for its page, it must also be the first
1000 : : * item on its entire level. On later same-level pages, a low key for a
1001 : : * page will be copied from the prior page in the code above. Generate a
1002 : : * minus infinity low key here instead.
1003 : : */
1004 [ + + ]: 5937538 : if (last_off == P_HIKEY)
1005 : : {
2320 pg@bowt.ie 1006 [ - + ]: 6904 : Assert(state->btps_lowkey == NULL);
95 michael@paquier.xyz 1007 :GNC 6904 : state->btps_lowkey = palloc0_object(IndexTupleData);
2320 pg@bowt.ie 1008 :CBC 6904 : state->btps_lowkey->t_info = sizeof(IndexTupleData);
2168 1009 : 6904 : BTreeTupleSetNAtts(state->btps_lowkey, 0, false);
1010 : : }
1011 : :
1012 : : /*
1013 : : * Add the new item into the current page.
1014 : : */
9368 tgl@sss.pgh.pa.us 1015 : 5937538 : last_off = OffsetNumberNext(last_off);
2162 pg@bowt.ie 1016 : 5937538 : _bt_sortaddtup(npage, itupsz, itup, last_off,
1017 [ + + + + ]: 5937538 : !isleaf && last_off == P_FIRSTKEY);
1018 : :
 : : /* Publish the updated working state back into the per-level BTPageState */
751 heikki.linnakangas@i 1019 : 5937538 : state->btps_buf = nbuf;
7956 tgl@sss.pgh.pa.us 1020 : 5937538 : state->btps_blkno = nblkno;
10416 bruce@momjian.us 1021 : 5937538 : state->btps_lastoff = last_off;
10623 scrappy@hub.org 1022 : 5937538 : }
1023 : :
1024 : : /*
1025 : : * Finalize pending posting list tuple, and add it to the index. Final tuple
1026 : : * is based on saved base tuple, and saved list of heap TIDs.
1027 : : *
1028 : : * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
1029 : : * using _bt_buildadd().
1030 : : */
1031 : : static void
2209 pg@bowt.ie 1032 : 2533099 : _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
1033 : : BTDedupState dstate)
1034 : : {
1035 [ - + ]: 2533099 : Assert(dstate->nitems > 0);
1036 : :
 : : /* A single pending item needs no posting list; add the base tuple as-is */
1037 [ + + ]: 2533099 : if (dstate->nitems == 1)
1038 : 2512017 : _bt_buildadd(wstate, state, dstate->base, 0);
1039 : : else
1040 : : {
1041 : : IndexTuple postingtuple;
1042 : : Size truncextra;
1043 : :
1044 : : /* form a tuple with a posting list */
1045 : 21082 : postingtuple = _bt_form_posting(dstate->base,
2209 pg@bowt.ie 1046 :GIC 21082 : dstate->htids,
1047 : : dstate->nhtids);
1048 : : /* Calculate posting list overhead */
2209 pg@bowt.ie 1049 :CBC 21082 : truncextra = IndexTupleSize(postingtuple) -
1050 : 21082 : BTreeTupleGetPostingOffset(postingtuple);
1051 : :
1052 : 21082 : _bt_buildadd(wstate, state, postingtuple, truncextra);
1053 : 21082 : pfree(postingtuple);
1054 : : }
1055 : :
 : : /* Reset dedup state so the caller can start a new pending posting list */
2095 1056 : 2533099 : dstate->nmaxitems = 0;
2209 1057 : 2533099 : dstate->nhtids = 0;
1058 : 2533099 : dstate->nitems = 0;
1059 : 2533099 : dstate->phystupsize = 0;
1060 : 2533099 : }
1061 : :
1062 : : /*
1063 : : * Finish writing out the completed btree.
1064 : : */
1065 : : static void
7956 tgl@sss.pgh.pa.us 1066 : 26197 : _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
1067 : : {
1068 : : BTPageState *s;
 : : /* rootblkno/rootlevel keep these defaults when the index has no data pages */
7868 bruce@momjian.us 1069 : 26197 : BlockNumber rootblkno = P_NONE;
8203 tgl@sss.pgh.pa.us 1070 : 26197 : uint32 rootlevel = 0;
1071 : : BulkWriteBuffer metabuf;
1072 : :
1073 : : /*
1074 : : * Each iteration of this loop completes one more level of the tree.
1075 : : */
8103 neilc@samurai.com 1076 [ + + ]: 33101 : for (s = state; s != NULL; s = s->btps_next)
1077 : : {
1078 : : BlockNumber blkno;
1079 : : BTPageOpaque opaque;
1080 : :
7956 tgl@sss.pgh.pa.us 1081 : 6904 : blkno = s->btps_blkno;
751 heikki.linnakangas@i 1082 : 6904 : opaque = BTPageGetOpaque((Page) s->btps_buf);
1083 : :
1084 : : /*
1085 : : * We have to link the last page on this level to somewhere.
1086 : : *
1087 : : * If we're at the top, it's the root, so attach it to the metapage.
1088 : : * Otherwise, add an entry for it to its parent using its low key.
1089 : : * This may cause the last page of the parent level to split, but
1090 : : * that's not a problem -- we haven't gotten to it yet.
1091 : : */
8103 neilc@samurai.com 1092 [ + + ]: 6904 : if (s->btps_next == NULL)
1093 : : {
9368 tgl@sss.pgh.pa.us 1094 : 5534 : opaque->btpo_flags |= BTP_ROOT;
8203 1095 : 5534 : rootblkno = blkno;
1096 : 5534 : rootlevel = s->btps_level;
1097 : : }
1098 : : else
1099 : : {
2320 pg@bowt.ie 1100 [ + - + - : 1370 : Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <=
+ - - + -
- - - ]
1101 : : IndexRelationGetNumberOfKeyAttributes(wstate->index) &&
1102 : : BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) ||
1103 : : P_LEFTMOST(opaque));
1104 [ + - + - : 1370 : Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 ||
- + ]
1105 : : !P_LEFTMOST(opaque));
2281 1106 : 1370 : BTreeTupleSetDownLink(s->btps_lowkey, blkno);
2209 1107 : 1370 : _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0);
2320 1108 : 1370 : pfree(s->btps_lowkey);
1109 : 1370 : s->btps_lowkey = NULL;
1110 : : }
1111 : :
1112 : : /*
1113 : : * This is the rightmost page, so the ItemId array needs to be slid
1114 : : * back one slot. Then we can dump out the page.
1115 : : */
751 heikki.linnakangas@i 1116 : 6904 : _bt_slideleft((Page) s->btps_buf);
1117 : 6904 : _bt_blwritepage(wstate, s->btps_buf, s->btps_blkno);
1118 : 6904 : s->btps_buf = NULL; /* writepage took ownership of the buffer */
1119 : : }
1120 : :
1121 : : /*
1122 : : * As the last step in the process, construct the metapage and make it
1123 : : * point to the new root (unless we had no data at all, in which case it's
1124 : : * set to point to "P_NONE"). This changes the index to the "valid" state
1125 : : * by filling in a valid magic number in the metapage.
1126 : : */
1127 : 26197 : metabuf = smgr_bulk_get_buf(wstate->bulkstate);
1128 : 26197 : _bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
2209 pg@bowt.ie 1129 : 26197 : wstate->inskey->allequalimage);
751 heikki.linnakangas@i 1130 : 26197 : _bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
10841 scrappy@hub.org 1131 : 26197 : }
1132 : :
1133 : : /*
1134 : : * Read tuples in correct sort order from tuplesort, and load them into
1135 : : * btree leaves.
1136 : : */
1137 : : static void
7956 tgl@sss.pgh.pa.us 1138 : 26197 : _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
1139 : : {
9368 1140 : 26197 : BTPageState *state = NULL;
9348 inoue@tpf.co.jp 1141 : 26197 : bool merge = (btspool2 != NULL);
1142 : : IndexTuple itup,
7354 tgl@sss.pgh.pa.us 1143 : 26197 : itup2 = NULL;
1144 : : bool load1;
7956 1145 : 26197 : TupleDesc tupdes = RelationGetDescr(wstate->index);
1146 : : int i,
2899 teodor@sigaev.ru 1147 : 26197 : keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
1148 : : SortSupport sortKeys;
2512 alvherre@alvh.no-ip. 1149 : 26197 : int64 tuples_done = 0;
1150 : : bool deduplicate;
1151 : :
751 heikki.linnakangas@i 1152 : 26197 : wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
1153 : :
 : : /*
 : : * Deduplicate only when every key column is "allequalimage", the index is
 : : * not unique, and the deduplicate_items reloption permits it.
 : : */
2067 pg@bowt.ie 1154 [ + + + + : 31062 : deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
+ - ]
2209 1155 [ + - - + : 4865 : BTGetDeduplicateItems(wstate->index);
+ + + - ]
1156 : :
9348 inoue@tpf.co.jp 1157 [ + + ]: 26197 : if (merge)
1158 : : {
1159 : : /*
1160 : : * Another BTSpool for dead tuples exists. Now we have to merge
1161 : : * btspool and btspool2.
1162 : : */
1163 : :
1164 : : /* the preparation of merge */
3380 rhaas@postgresql.org 1165 : 14 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1166 : 14 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1167 : :
1168 : : /* Prepare SortSupport data for each column */
95 michael@paquier.xyz 1169 :GNC 14 : sortKeys = palloc0_array(SortSupportData, keysz);
1170 : :
4146 rhaas@postgresql.org 1171 [ + + ]:CBC 30 : for (i = 0; i < keysz; i++)
1172 : : {
1173 : 16 : SortSupport sortKey = sortKeys + i;
2552 pg@bowt.ie 1174 : 16 : ScanKey scanKey = wstate->inskey->scankeys + i;
1175 : : bool reverse;
1176 : :
4146 rhaas@postgresql.org 1177 : 16 : sortKey->ssup_cxt = CurrentMemoryContext;
1178 : 16 : sortKey->ssup_collation = scanKey->sk_collation;
1179 : 16 : sortKey->ssup_nulls_first =
1180 : 16 : (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0;
1181 : 16 : sortKey->ssup_attno = scanKey->sk_attno;
1182 : : /* Abbreviation is not supported here */
4073 1183 : 16 : sortKey->abbreviate = false;
1184 : :
1234 peter@eisentraut.org 1185 [ - + ]: 16 : Assert(sortKey->ssup_attno != 0);
1186 : :
366 1187 : 16 : reverse = (scanKey->sk_flags & SK_BT_DESC) != 0;
1188 : :
1189 : 16 : PrepareSortSupportFromIndexRel(wstate->index, reverse, sortKey);
1190 : : }
1191 : :
 : : /* Merge loop: emit the smaller of the two spools' head tuples each pass */
1192 : : for (;;)
1193 : : {
9124 bruce@momjian.us 1194 : 1685 : load1 = true; /* load BTSpool next ? */
7354 tgl@sss.pgh.pa.us 1195 [ + + ]: 1685 : if (itup2 == NULL)
1196 : : {
1197 [ + + ]: 83 : if (itup == NULL)
9348 inoue@tpf.co.jp 1198 : 14 : break;
1199 : : }
7354 tgl@sss.pgh.pa.us 1200 [ + + ]: 1602 : else if (itup != NULL)
1201 : : {
2552 pg@bowt.ie 1202 : 1503 : int32 compare = 0;
1203 : :
9348 inoue@tpf.co.jp 1204 [ + + ]: 1629 : for (i = 1; i <= keysz; i++)
1205 : : {
1206 : : SortSupport entry;
1207 : : Datum attrDatum1,
1208 : : attrDatum2;
1209 : : bool isNull1,
1210 : : isNull2;
1211 : :
4146 rhaas@postgresql.org 1212 : 1530 : entry = sortKeys + i - 1;
7005 tgl@sss.pgh.pa.us 1213 : 1530 : attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
1214 : 1530 : attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
1215 : :
4146 rhaas@postgresql.org 1216 : 1530 : compare = ApplySortComparator(attrDatum1, isNull1,
1217 : : attrDatum2, isNull2,
1218 : : entry);
7005 tgl@sss.pgh.pa.us 1219 [ + + ]: 1530 : if (compare > 0)
1220 : : {
1221 : 139 : load1 = false;
1222 : 1404 : break;
1223 : : }
1224 [ + + ]: 1391 : else if (compare < 0)
1225 : 1265 : break;
1226 : : }
1227 : :
1228 : : /*
1229 : : * If key values are equal, we sort on ItemPointer. This is
1230 : : * required for btree indexes, since heap TID is treated as an
1231 : : * implicit last key attribute in order to ensure that all
1232 : : * keys in the index are physically unique.
1233 : : */
2552 pg@bowt.ie 1234 [ + + ]: 1503 : if (compare == 0)
1235 : : {
1236 : 99 : compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
1237 [ - + ]: 99 : Assert(compare != 0);
1238 [ + + ]: 99 : if (compare > 0)
1239 : 16 : load1 = false;
1240 : : }
1241 : : }
1242 : : else
9348 inoue@tpf.co.jp 1243 : 99 : load1 = false;
1244 : :
1245 : : /* When we see first tuple, create first index page */
1246 [ + + ]: 1671 : if (state == NULL)
7956 tgl@sss.pgh.pa.us 1247 : 14 : state = _bt_pagestate(wstate, 0);
1248 : :
9348 inoue@tpf.co.jp 1249 [ + + ]: 1671 : if (load1)
1250 : : {
2209 pg@bowt.ie 1251 : 1417 : _bt_buildadd(wstate, state, itup, 0);
3380 rhaas@postgresql.org 1252 : 1417 : itup = tuplesort_getindextuple(btspool->sortstate, true);
1253 : : }
1254 : : else
1255 : : {
2209 pg@bowt.ie 1256 : 254 : _bt_buildadd(wstate, state, itup2, 0);
3380 rhaas@postgresql.org 1257 : 254 : itup2 = tuplesort_getindextuple(btspool2->sortstate, true);
1258 : : }
1259 : :
1260 : : /* Report progress */
2539 alvherre@alvh.no-ip. 1261 : 1671 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1262 : : ++tuples_done);
1263 : : }
4146 rhaas@postgresql.org 1264 : 14 : pfree(sortKeys);
1265 : : }
2209 pg@bowt.ie 1266 [ + + ]: 26183 : else if (deduplicate)
1267 : : {
1268 : : /* merge is unnecessary, deduplicate into posting lists */
1269 : : BTDedupState dstate;
1270 : :
95 michael@paquier.xyz 1271 :GNC 4865 : dstate = palloc_object(BTDedupStateData);
2209 pg@bowt.ie 1272 :CBC 4865 : dstate->deduplicate = true; /* unused */
2095 1273 : 4865 : dstate->nmaxitems = 0; /* unused */
2209 1274 : 4865 : dstate->maxpostingsize = 0; /* set later */
1275 : : /* Metadata about base tuple of current pending posting list */
1276 : 4865 : dstate->base = NULL;
1277 : 4865 : dstate->baseoff = InvalidOffsetNumber; /* unused */
1278 : 4865 : dstate->basetupsize = 0;
1279 : : /* Metadata about current pending posting list TIDs */
1280 : 4865 : dstate->htids = NULL;
1281 : 4865 : dstate->nhtids = 0;
1282 : 4865 : dstate->nitems = 0;
1283 : 4865 : dstate->phystupsize = 0; /* unused */
1284 : 4865 : dstate->nintervals = 0; /* unused */
1285 : :
1286 : 3325934 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
1287 [ + + ]: 3325934 : true)) != NULL)
1288 : : {
1289 : : /* When we see first tuple, create first index page */
1290 [ + + ]: 3321069 : if (state == NULL)
1291 : : {
1292 : 1424 : state = _bt_pagestate(wstate, 0);
1293 : :
1294 : : /*
1295 : : * Limit size of posting list tuples to 1/10 space we want to
1296 : : * leave behind on the page, plus space for final item's line
1297 : : * pointer. This is equal to the space that we'd like to
1298 : : * leave behind on each leaf page when fillfactor is 90,
1299 : : * allowing us to get close to fillfactor% space utilization
1300 : : * when there happen to be a great many duplicates. (This
1301 : : * makes higher leaf fillfactor settings ineffective when
1302 : : * building indexes that have many duplicates, but packing
1303 : : * leaf pages full with few very large tuples doesn't seem
1304 : : * like a useful goal.)
1305 : : */
1306 : 1424 : dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
1307 : : sizeof(ItemIdData);
369 1308 [ + - - + ]: 1424 : Assert(dstate->maxpostingsize <= BTMaxItemSize &&
1309 : : dstate->maxpostingsize <= INDEX_SIZE_MASK);
2209 1310 : 1424 : dstate->htids = palloc(dstate->maxpostingsize);
1311 : :
1312 : : /* start new pending posting list with itup copy */
1313 : 1424 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1314 : : InvalidOffsetNumber);
1315 : : }
1316 [ + + ]: 3319645 : else if (_bt_keep_natts_fast(wstate->index, dstate->base,
1317 [ + + ]: 792705 : itup) > keysz &&
1318 : 792705 : _bt_dedup_save_htid(dstate, itup))
1319 : : {
1320 : : /*
1321 : : * Tuple is equal to base tuple of pending posting list. Heap
1322 : : * TID from itup has been saved in state.
1323 : : */
1324 : : }
1325 : : else
1326 : : {
1327 : : /*
1328 : : * Tuple is not equal to pending posting list tuple, or
1329 : : * _bt_dedup_save_htid() opted to not merge current item into
1330 : : * pending posting list.
1331 : : */
1332 : 2531675 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1333 : 2531675 : pfree(dstate->base);
1334 : :
1335 : : /* start new pending posting list with itup copy */
1336 : 2531675 : _bt_dedup_start_pending(dstate, CopyIndexTuple(itup),
1337 : : InvalidOffsetNumber);
1338 : : }
1339 : :
1340 : : /* Report progress */
1341 : 3321069 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1342 : : ++tuples_done);
1343 : : }
1344 : :
1345 [ + + ]: 4865 : if (state)
1346 : : {
1347 : : /*
1348 : : * Handle the last item (there must be a last item when the
1349 : : * tuplesort returned one or more tuples)
1350 : : */
1351 : 1424 : _bt_sort_dedup_finish_pending(wstate, state, dstate);
1352 : 1424 : pfree(dstate->base);
1353 : 1424 : pfree(dstate->htids);
1354 : : }
1355 : :
1356 : 4865 : pfree(dstate);
1357 : : }
1358 : : else
1359 : : {
1360 : : /* merging and deduplication are both unnecessary */
7354 tgl@sss.pgh.pa.us 1361 : 3402022 : while ((itup = tuplesort_getindextuple(btspool->sortstate,
3380 rhaas@postgresql.org 1362 [ + + ]: 3402022 : true)) != NULL)
1363 : : {
1364 : : /* When we see first tuple, create first index page */
9348 inoue@tpf.co.jp 1365 [ + + ]: 3380704 : if (state == NULL)
7956 tgl@sss.pgh.pa.us 1366 : 4096 : state = _bt_pagestate(wstate, 0);
1367 : :
2209 pg@bowt.ie 1368 : 3380704 : _bt_buildadd(wstate, state, itup, 0);
1369 : :
1370 : : /* Report progress */
2539 alvherre@alvh.no-ip. 1371 : 3380704 : pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
1372 : : ++tuples_done);
1373 : : }
1374 : : }
1375 : :
1376 : : /* Close down final pages and write the metapage */
7956 tgl@sss.pgh.pa.us 1377 : 26197 : _bt_uppershutdown(wstate, state);
751 heikki.linnakangas@i 1378 : 26197 : smgr_bulk_finish(wstate->bulkstate);
10841 scrappy@hub.org 1379 : 26197 : }
1380 : :
1381 : : /*
1382 : : * Create parallel context, and launch workers for leader.
1383 : : *
1384 : : * buildstate argument should be initialized (with the exception of the
1385 : : * tuplesort state in spools, which may later be created based on shared
1386 : : * state initially set up here).
1387 : : *
1388 : : * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
1389 : : *
1390 : : * request is the target number of parallel worker processes to launch.
1391 : : *
1392 : : * Sets buildstate's BTLeader, which caller must use to shut down parallel
1393 : : * mode by passing it to _bt_end_parallel() at the very end of its index
1394 : : * build. If not even a single worker process can be launched, this is
1395 : : * never set, and caller should proceed with a serial index build.
1396 : : */
1397 : : static void
2963 rhaas@postgresql.org 1398 : 82 : _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request)
1399 : : {
1400 : : ParallelContext *pcxt;
1401 : : int scantuplesortstates;
1402 : : Snapshot snapshot;
1403 : : Size estbtshared;
1404 : : Size estsort;
1405 : : BTShared *btshared;
1406 : : Sharedsort *sharedsort;
1407 : : Sharedsort *sharedsort2;
1408 : 82 : BTSpool *btspool = buildstate->spool;
95 michael@paquier.xyz 1409 :GNC 82 : BTLeader *btleader = palloc0_object(BTLeader);
1410 : : WalUsage *walusage;
1411 : : BufferUsage *bufferusage;
2963 rhaas@postgresql.org 1412 :CBC 82 : bool leaderparticipates = true;
1413 : : int querylen;
1414 : :
1415 : : #ifdef DISABLE_LEADER_PARTICIPATION
1416 : : leaderparticipates = false;
1417 : : #endif
1418 : :
1419 : : /*
1420 : : * Enter parallel mode, and create context for parallel build of btree
1421 : : * index
1422 : : */
1423 : 82 : EnterParallelMode();
1424 [ - + ]: 82 : Assert(request > 0);
1425 : 82 : pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main",
1426 : : request);
1427 : :
1428 [ + - ]: 82 : scantuplesortstates = leaderparticipates ? request + 1 : request;
1429 : :
1430 : : /*
1431 : : * Prepare for scan of the base relation. In a normal index build, we use
1432 : : * SnapshotAny because we must retrieve all tuples and do our own time
1433 : : * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1434 : : * concurrent build, we take a regular MVCC snapshot and index whatever's
1435 : : * live according to that.
1436 : : */
1437 [ + - ]: 82 : if (!isconcurrent)
1438 : 82 : snapshot = SnapshotAny;
1439 : : else
2963 rhaas@postgresql.org 1440 :UBC 0 : snapshot = RegisterSnapshot(GetTransactionSnapshot());
1441 : :
1442 : : /*
1443 : : * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
1444 : : * PARALLEL_KEY_TUPLESORT tuplesort workspace
1445 : : */
2561 andres@anarazel.de 1446 :CBC 82 : estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot);
2963 rhaas@postgresql.org 1447 : 82 : shm_toc_estimate_chunk(&pcxt->estimator, estbtshared);
1448 : 82 : estsort = tuplesort_estimate_shared(scantuplesortstates);
1449 : 82 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1450 : :
1451 : : /*
1452 : : * Unique case requires a second spool, and so we may have to account for
1453 : : * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
1454 : : */
1455 [ + + ]: 82 : if (!btspool->isunique)
1456 : 45 : shm_toc_estimate_keys(&pcxt->estimator, 2);
1457 : : else
1458 : : {
1459 : 37 : shm_toc_estimate_chunk(&pcxt->estimator, estsort);
1460 : 37 : shm_toc_estimate_keys(&pcxt->estimator, 3);
1461 : : }
1462 : :
1463 : : /*
1464 : : * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
1465 : : * and PARALLEL_KEY_BUFFER_USAGE.
1466 : : *
1467 : : * If there are no extensions loaded that care, we could skip this. We
1468 : : * have no way of knowing whether anyone's looking at pgWalUsage or
1469 : : * pgBufferUsage, so do it unconditionally.
1470 : : */
2171 akapila@postgresql.o 1471 : 82 : shm_toc_estimate_chunk(&pcxt->estimator,
1472 : : mul_size(sizeof(WalUsage), pcxt->nworkers));
1473 : 82 : shm_toc_estimate_keys(&pcxt->estimator, 1);
2166 1474 : 82 : shm_toc_estimate_chunk(&pcxt->estimator,
1475 : : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1476 : 82 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1477 : :
1478 : : /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
1961 noah@leadboat.com 1479 [ + - ]: 82 : if (debug_query_string)
1480 : : {
1481 : 82 : querylen = strlen(debug_query_string);
1482 : 82 : shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);
1483 : 82 : shm_toc_estimate_keys(&pcxt->estimator, 1);
1484 : : }
1485 : : else
1961 noah@leadboat.com 1486 :UBC 0 : querylen = 0; /* keep compiler quiet */
1487 : :
1488 : : /* Everyone's had a chance to ask for space, so now create the DSM */
2963 rhaas@postgresql.org 1489 :CBC 82 : InitializeParallelDSM(pcxt);
1490 : :
1491 : : /* If no DSM segment was available, back out (do serial build) */
2230 tmunro@postgresql.or 1492 [ - + ]: 82 : if (pcxt->seg == NULL)
1493 : : {
2230 tmunro@postgresql.or 1494 [ # # # # ]:UBC 0 : if (IsMVCCSnapshot(snapshot))
1495 : 0 : UnregisterSnapshot(snapshot);
1496 : 0 : DestroyParallelContext(pcxt);
1497 : 0 : ExitParallelMode();
1498 : 0 : return;
1499 : : }
1500 : :
1501 : : /* Store shared build state, for which we reserved space */
2963 rhaas@postgresql.org 1502 :CBC 82 : btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared);
1503 : : /* Initialize immutable state */
1504 : 82 : btshared->heaprelid = RelationGetRelid(btspool->heap);
1505 : 82 : btshared->indexrelid = RelationGetRelid(btspool->index);
1506 : 82 : btshared->isunique = btspool->isunique;
1501 peter@eisentraut.org 1507 : 82 : btshared->nulls_not_distinct = btspool->nulls_not_distinct;
2963 rhaas@postgresql.org 1508 : 82 : btshared->isconcurrent = isconcurrent;
1509 : 82 : btshared->scantuplesortstates = scantuplesortstates;
531 michael@paquier.xyz 1510 : 82 : btshared->queryid = pgstat_get_my_query_id();
2963 rhaas@postgresql.org 1511 : 82 : ConditionVariableInit(&btshared->workersdonecv);
1512 : 82 : SpinLockInit(&btshared->mutex);
1513 : : /* Initialize mutable state */
1514 : 82 : btshared->nparticipantsdone = 0;
1515 : 82 : btshared->reltuples = 0.0;
1516 : 82 : btshared->havedead = false;
1517 : 82 : btshared->indtuples = 0.0;
1518 : 82 : btshared->brokenhotchain = false;
2561 andres@anarazel.de 1519 : 82 : table_parallelscan_initialize(btspool->heap,
1520 : : ParallelTableScanFromBTShared(btshared),
1521 : : snapshot);
1522 : :
1523 : : /*
1524 : : * Store shared tuplesort-private state, for which we reserved space.
1525 : : * Then, initialize opaque state using tuplesort routine.
1526 : : */
2963 rhaas@postgresql.org 1527 : 82 : sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1528 : 82 : tuplesort_initialize_shared(sharedsort, scantuplesortstates,
1529 : : pcxt->seg);
1530 : :
1531 : 82 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
1532 : 82 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);
1533 : :
1534 : : /* Unique case requires a second spool, and associated shared state */
1535 [ + + ]: 82 : if (!btspool->isunique)
1536 : 45 : sharedsort2 = NULL;
1537 : : else
1538 : : {
1539 : : /*
1540 : : * Store additional shared tuplesort-private state, for which we
1541 : : * reserved space. Then, initialize opaque state using tuplesort
1542 : : * routine.
1543 : : */
1544 : 37 : sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);
1545 : 37 : tuplesort_initialize_shared(sharedsort2, scantuplesortstates,
1546 : : pcxt->seg);
1547 : :
1548 : 37 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2);
1549 : : }
1550 : :
1551 : : /* Store query string for workers */
1961 noah@leadboat.com 1552 [ + - ]: 82 : if (debug_query_string)
1553 : : {
1554 : : char *sharedquery;
1555 : :
1556 : 82 : sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);
1557 : 82 : memcpy(sharedquery, debug_query_string, querylen + 1);
1558 : 82 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);
1559 : : }
1560 : :
1561 : : /*
1562 : : * Allocate space for each worker's WalUsage and BufferUsage; no need to
1563 : : * initialize.
1564 : : */
2171 akapila@postgresql.o 1565 : 82 : walusage = shm_toc_allocate(pcxt->toc,
1566 : 82 : mul_size(sizeof(WalUsage), pcxt->nworkers));
1567 : 82 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);
2166 1568 : 82 : bufferusage = shm_toc_allocate(pcxt->toc,
1569 : 82 : mul_size(sizeof(BufferUsage), pcxt->nworkers));
1570 : 82 : shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);
1571 : :
1572 : : /* Launch workers, saving status for leader/caller */
2963 rhaas@postgresql.org 1573 : 82 : LaunchParallelWorkers(pcxt);
1574 : 82 : btleader->pcxt = pcxt;
1575 : 82 : btleader->nparticipanttuplesorts = pcxt->nworkers_launched;
1576 [ + - ]: 82 : if (leaderparticipates)
1577 : 82 : btleader->nparticipanttuplesorts++;
1578 : 82 : btleader->btshared = btshared;
1579 : 82 : btleader->sharedsort = sharedsort;
1580 : 82 : btleader->sharedsort2 = sharedsort2;
1581 : 82 : btleader->snapshot = snapshot;
2171 akapila@postgresql.o 1582 : 82 : btleader->walusage = walusage;
2166 1583 : 82 : btleader->bufferusage = bufferusage;
1584 : :
1585 : : /* If no workers were successfully launched, back out (do serial build) */
2963 rhaas@postgresql.org 1586 [ - + ]: 82 : if (pcxt->nworkers_launched == 0)
1587 : : {
2963 rhaas@postgresql.org 1588 :UBC 0 : _bt_end_parallel(btleader);
1589 : 0 : return;
1590 : : }
1591 : :
1592 : : /* Save leader state now that it's clear build will be parallel */
2963 rhaas@postgresql.org 1593 :CBC 82 : buildstate->btleader = btleader;
1594 : :
1595 : : /* Join heap scan ourselves */
1596 [ + - ]: 82 : if (leaderparticipates)
1597 : 82 : _bt_leader_participate_as_worker(buildstate);
1598 : :
1599 : : /*
1600 : : * Caller needs to wait for all launched workers when we return. Make
1601 : : * sure that the failure-to-start case will not hang forever.
1602 : : */
1603 : 82 : WaitForParallelWorkersToAttach(pcxt);
1604 : : }
1605 : :
1606 : : /*
1607 : : * Shut down workers, destroy parallel context, and end parallel mode.
1608 : : */
1609 : : static void
1610 : 82 : _bt_end_parallel(BTLeader *btleader)
1611 : : {
1612 : : int i;
1613 : :
1614 : : /* Shutdown worker processes */
1615 : 82 : WaitForParallelWorkersToFinish(btleader->pcxt);
1616 : :
1617 : : /*
1618 : : * Next, accumulate WAL usage. (This must wait for the workers to finish,
1619 : : * or we might get incomplete data.)
1620 : : */
2171 akapila@postgresql.o 1621 [ + + ]: 167 : for (i = 0; i < btleader->pcxt->nworkers_launched; i++)
2166 1622 : 85 : InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]);
1623 : :
1624 : : /* Free last reference to MVCC snapshot, if one was used */
2963 rhaas@postgresql.org 1625 [ - + - + ]: 82 : if (IsMVCCSnapshot(btleader->snapshot))
2963 rhaas@postgresql.org 1626 :UBC 0 : UnregisterSnapshot(btleader->snapshot);
2963 rhaas@postgresql.org 1627 :CBC 82 : DestroyParallelContext(btleader->pcxt);
1628 : 82 : ExitParallelMode();
1629 : 82 : }
1630 : :
1631 : : /*
1632 : : * Returns size of shared memory required to store state for a parallel
1633 : : * btree index build based on the snapshot its parallel scan will use.
1634 : : */
1635 : : static Size
2561 andres@anarazel.de 1636 : 82 : _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1637 : : {
1638 : : /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
1639 : 82 : return add_size(BUFFERALIGN(sizeof(BTShared)),
1640 : : table_parallelscan_estimate(heap, snapshot));
1641 : : }
1642 : :
1643 : : /*
1644 : : * Within leader, wait for end of heap scan.
1645 : : *
1646 : : * When called, parallel heap scan started by _bt_begin_parallel() will
1647 : : * already be underway within worker processes (when leader participates
1648 : : * as a worker, we should end up here just as workers are finishing).
1649 : : *
1650 : : * Fills in fields needed for ambuild statistics, and lets caller set
1651 : : * field indicating that some worker encountered a broken HOT chain.
1652 : : *
1653 : : * Returns the total number of heap tuples scanned.
1654 : : */
1655 : : static double
2963 rhaas@postgresql.org 1656 : 82 : _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain)
1657 : : {
1658 : 82 : BTShared *btshared = buildstate->btleader->btshared;
1659 : : int nparticipanttuplesorts;
1660 : : double reltuples;
1661 : :
1662 : 82 : nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts;
1663 : : for (;;)
1664 : : {
1665 [ - + ]: 220 : SpinLockAcquire(&btshared->mutex);
1666 [ + + ]: 220 : if (btshared->nparticipantsdone == nparticipanttuplesorts)
1667 : : {
1668 : 82 : buildstate->havedead = btshared->havedead;
1669 : 82 : buildstate->indtuples = btshared->indtuples;
1670 : 82 : *brokenhotchain = btshared->brokenhotchain;
1671 : 82 : reltuples = btshared->reltuples;
1672 : 82 : SpinLockRelease(&btshared->mutex);
1673 : 82 : break;
1674 : : }
1675 : 138 : SpinLockRelease(&btshared->mutex);
1676 : :
1677 : 138 : ConditionVariableSleep(&btshared->workersdonecv,
1678 : : WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
1679 : : }
1680 : :
1681 : 82 : ConditionVariableCancelSleep();
1682 : :
1683 : 82 : return reltuples;
1684 : : }
1685 : :
1686 : : /*
1687 : : * Within leader, participate as a parallel worker.
1688 : : */
1689 : : static void
1690 : 82 : _bt_leader_participate_as_worker(BTBuildState *buildstate)
1691 : : {
1692 : 82 : BTLeader *btleader = buildstate->btleader;
1693 : : BTSpool *leaderworker;
1694 : : BTSpool *leaderworker2;
1695 : : int sortmem;
1696 : :
1697 : : /* Allocate memory and initialize private spool */
95 michael@paquier.xyz 1698 :GNC 82 : leaderworker = palloc0_object(BTSpool);
2963 rhaas@postgresql.org 1699 :CBC 82 : leaderworker->heap = buildstate->spool->heap;
1700 : 82 : leaderworker->index = buildstate->spool->index;
1701 : 82 : leaderworker->isunique = buildstate->spool->isunique;
1501 peter@eisentraut.org 1702 : 82 : leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct;
1703 : :
1704 : : /* Initialize second spool, if required */
2963 rhaas@postgresql.org 1705 [ + + ]: 82 : if (!btleader->btshared->isunique)
1706 : 45 : leaderworker2 = NULL;
1707 : : else
1708 : : {
1709 : : /* Allocate memory for worker's own private secondary spool */
95 michael@paquier.xyz 1710 :GNC 37 : leaderworker2 = palloc0_object(BTSpool);
1711 : :
1712 : : /* Initialize worker's own secondary spool */
2963 rhaas@postgresql.org 1713 :CBC 37 : leaderworker2->heap = leaderworker->heap;
1714 : 37 : leaderworker2->index = leaderworker->index;
1715 : 37 : leaderworker2->isunique = false;
1716 : : }
1717 : :
1718 : : /*
1719 : : * Might as well use reliable figure when doling out maintenance_work_mem
1720 : : * (when requested number of workers were not launched, this will be
1721 : : * somewhat higher than it is for other workers).
1722 : : */
1723 : 82 : sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts;
1724 : :
1725 : : /* Perform work common to all participants */
1726 : 82 : _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared,
1727 : : btleader->sharedsort, btleader->sharedsort2,
1728 : : sortmem, true);
1729 : :
1730 : : #ifdef BTREE_BUILD_STATS
1731 : : if (log_btree_build_stats)
1732 : : {
1733 : : ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
1734 : : ResetUsage();
1735 : : }
1736 : : #endif /* BTREE_BUILD_STATS */
1737 : 82 : }
1738 : :
1739 : : /*
1740 : : * Perform work within a launched parallel process.
1741 : : */
1742 : : void
1743 : 85 : _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
1744 : : {
1745 : : char *sharedquery;
1746 : : BTSpool *btspool;
1747 : : BTSpool *btspool2;
1748 : : BTShared *btshared;
1749 : : Sharedsort *sharedsort;
1750 : : Sharedsort *sharedsort2;
1751 : : Relation heapRel;
1752 : : Relation indexRel;
1753 : : LOCKMODE heapLockmode;
1754 : : LOCKMODE indexLockmode;
1755 : : WalUsage *walusage;
1756 : : BufferUsage *bufferusage;
1757 : : int sortmem;
1758 : :
1759 : : #ifdef BTREE_BUILD_STATS
1760 : : if (log_btree_build_stats)
1761 : : ResetUsage();
1762 : : #endif /* BTREE_BUILD_STATS */
1763 : :
1764 : : /*
1765 : : * The only possible status flag that can be set to the parallel worker is
1766 : : * PROC_IN_SAFE_IC.
1767 : : */
1577 akapila@postgresql.o 1768 [ - + - - ]: 85 : Assert((MyProc->statusFlags == 0) ||
1769 : : (MyProc->statusFlags == PROC_IN_SAFE_IC));
1770 : :
1771 : : /* Set debug_query_string for individual workers first */
1961 noah@leadboat.com 1772 : 85 : sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
2915 rhaas@postgresql.org 1773 : 85 : debug_query_string = sharedquery;
1774 : :
1775 : : /* Report the query string from leader */
1776 : 85 : pgstat_report_activity(STATE_RUNNING, debug_query_string);
1777 : :
1778 : : /* Look up nbtree shared state */
2963 1779 : 85 : btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
1780 : :
1781 : : /* Open relations using lock modes known to be obtained by index.c */
1782 [ + - ]: 85 : if (!btshared->isconcurrent)
1783 : : {
1784 : 85 : heapLockmode = ShareLock;
1785 : 85 : indexLockmode = AccessExclusiveLock;
1786 : : }
1787 : : else
1788 : : {
2963 rhaas@postgresql.org 1789 :UBC 0 : heapLockmode = ShareUpdateExclusiveLock;
1790 : 0 : indexLockmode = RowExclusiveLock;
1791 : : }
1792 : :
1793 : : /* Track query ID */
531 michael@paquier.xyz 1794 :CBC 85 : pgstat_report_query_id(btshared->queryid, false);
1795 : :
1796 : : /* Open relations within worker */
2610 andres@anarazel.de 1797 : 85 : heapRel = table_open(btshared->heaprelid, heapLockmode);
2963 rhaas@postgresql.org 1798 : 85 : indexRel = index_open(btshared->indexrelid, indexLockmode);
1799 : :
1800 : : /* Initialize worker's own spool */
95 michael@paquier.xyz 1801 :GNC 85 : btspool = palloc0_object(BTSpool);
2963 rhaas@postgresql.org 1802 :CBC 85 : btspool->heap = heapRel;
1803 : 85 : btspool->index = indexRel;
1804 : 85 : btspool->isunique = btshared->isunique;
1501 peter@eisentraut.org 1805 : 85 : btspool->nulls_not_distinct = btshared->nulls_not_distinct;
1806 : :
1807 : : /* Look up shared state private to tuplesort.c */
2963 rhaas@postgresql.org 1808 : 85 : sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
1809 : 85 : tuplesort_attach_shared(sharedsort, seg);
1810 [ + + ]: 85 : if (!btshared->isunique)
1811 : : {
1812 : 48 : btspool2 = NULL;
1813 : 48 : sharedsort2 = NULL;
1814 : : }
1815 : : else
1816 : : {
1817 : : /* Allocate memory for worker's own private secondary spool */
95 michael@paquier.xyz 1818 :GNC 37 : btspool2 = palloc0_object(BTSpool);
1819 : :
1820 : : /* Initialize worker's own secondary spool */
2963 rhaas@postgresql.org 1821 :CBC 37 : btspool2->heap = btspool->heap;
1822 : 37 : btspool2->index = btspool->index;
1823 : 37 : btspool2->isunique = false;
1824 : : /* Look up shared state private to tuplesort.c */
1825 : 37 : sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
1826 : 37 : tuplesort_attach_shared(sharedsort2, seg);
1827 : : }
1828 : :
1829 : : /* Prepare to track buffer usage during parallel execution */
2171 akapila@postgresql.o 1830 : 85 : InstrStartParallelQuery();
1831 : :
1832 : : /* Perform sorting of spool, and possibly a spool2 */
2963 rhaas@postgresql.org 1833 : 85 : sortmem = maintenance_work_mem / btshared->scantuplesortstates;
1834 : 85 : _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort,
1835 : : sharedsort2, sortmem, false);
1836 : :
1837 : : /* Report WAL/buffer usage during parallel execution */
2166 akapila@postgresql.o 1838 : 85 : bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);
2171 1839 : 85 : walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);
2166 1840 : 85 : InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],
1841 : 85 : &walusage[ParallelWorkerNumber]);
1842 : :
1843 : : #ifdef BTREE_BUILD_STATS
1844 : : if (log_btree_build_stats)
1845 : : {
1846 : : ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
1847 : : ResetUsage();
1848 : : }
1849 : : #endif /* BTREE_BUILD_STATS */
1850 : :
2963 rhaas@postgresql.org 1851 : 85 : index_close(indexRel, indexLockmode);
2610 andres@anarazel.de 1852 : 85 : table_close(heapRel, heapLockmode);
2963 rhaas@postgresql.org 1853 : 85 : }
1854 : :
1855 : : /*
1856 : : * Perform a worker's portion of a parallel sort.
1857 : : *
1858 : : * This generates a tuplesort for passed btspool, and a second tuplesort
1859 : : * state if a second btspool is need (i.e. for unique index builds). All
1860 : : * other spool fields should already be set when this is called.
1861 : : *
1862 : : * sortmem is the amount of working memory to use within each worker,
1863 : : * expressed in KBs.
1864 : : *
1865 : : * When this returns, workers are done, and need only release resources.
1866 : : */
1867 : : static void
1868 : 167 : _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
1869 : : BTShared *btshared, Sharedsort *sharedsort,
1870 : : Sharedsort *sharedsort2, int sortmem, bool progress)
1871 : : {
1872 : : SortCoordinate coordinate;
1873 : : BTBuildState buildstate;
1874 : : TableScanDesc scan;
1875 : : double reltuples;
1876 : : IndexInfo *indexInfo;
1877 : :
1878 : : /* Initialize local tuplesort coordination state */
95 michael@paquier.xyz 1879 :GNC 167 : coordinate = palloc0_object(SortCoordinateData);
2963 rhaas@postgresql.org 1880 :CBC 167 : coordinate->isWorker = true;
1881 : 167 : coordinate->nParticipants = -1;
1882 : 167 : coordinate->sharedsort = sharedsort;
1883 : :
1884 : : /* Begin "partial" tuplesort */
1885 : 334 : btspool->sortstate = tuplesort_begin_index_btree(btspool->heap,
1886 : : btspool->index,
1887 : 167 : btspool->isunique,
1501 peter@eisentraut.org 1888 : 167 : btspool->nulls_not_distinct,
1889 : : sortmem, coordinate,
1890 : : TUPLESORT_NONE);
1891 : :
1892 : : /*
1893 : : * Just as with serial case, there may be a second spool. If so, a
1894 : : * second, dedicated spool2 partial tuplesort is required.
1895 : : */
2963 rhaas@postgresql.org 1896 [ + + ]: 167 : if (btspool2)
1897 : : {
1898 : : SortCoordinate coordinate2;
1899 : :
1900 : : /*
1901 : : * We expect that the second one (for dead tuples) won't get very
1902 : : * full, so we give it only work_mem (unless sortmem is less for
1903 : : * worker). Worker processes are generally permitted to allocate
1904 : : * work_mem independently.
1905 : : */
95 michael@paquier.xyz 1906 :GNC 74 : coordinate2 = palloc0_object(SortCoordinateData);
2963 rhaas@postgresql.org 1907 :CBC 74 : coordinate2->isWorker = true;
1908 : 74 : coordinate2->nParticipants = -1;
1909 : 74 : coordinate2->sharedsort = sharedsort2;
1910 : 74 : btspool2->sortstate =
1501 peter@eisentraut.org 1911 : 74 : tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false,
1912 : : Min(sortmem, work_mem), coordinate2,
1913 : : false);
1914 : : }
1915 : :
1916 : : /* Fill in buildstate for _bt_build_callback() */
2963 rhaas@postgresql.org 1917 : 167 : buildstate.isunique = btshared->isunique;
1501 peter@eisentraut.org 1918 : 167 : buildstate.nulls_not_distinct = btshared->nulls_not_distinct;
2963 rhaas@postgresql.org 1919 : 167 : buildstate.havedead = false;
1920 : 167 : buildstate.heap = btspool->heap;
1921 : 167 : buildstate.spool = btspool;
1922 : 167 : buildstate.spool2 = btspool2;
1923 : 167 : buildstate.indtuples = 0;
1924 : 167 : buildstate.btleader = NULL;
1925 : :
1926 : : /* Join parallel scan */
1927 : 167 : indexInfo = BuildIndexInfo(btspool->index);
1928 : 167 : indexInfo->ii_Concurrent = btshared->isconcurrent;
2539 alvherre@alvh.no-ip. 1929 : 167 : scan = table_beginscan_parallel(btspool->heap,
1930 : : ParallelTableScanFromBTShared(btshared));
2545 andres@anarazel.de 1931 : 167 : reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo,
1932 : : true, progress, _bt_build_callback,
1933 : : &buildstate, scan);
1934 : :
1935 : : /* Execute this worker's part of the sort */
1738 alvherre@alvh.no-ip. 1936 [ + + ]: 167 : if (progress)
1937 : 82 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1938 : : PROGRESS_BTREE_PHASE_PERFORMSORT_1);
2963 rhaas@postgresql.org 1939 : 167 : tuplesort_performsort(btspool->sortstate);
1940 [ + + ]: 167 : if (btspool2)
1941 : : {
1738 alvherre@alvh.no-ip. 1942 [ + + ]: 74 : if (progress)
1943 : 37 : pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
1944 : : PROGRESS_BTREE_PHASE_PERFORMSORT_2);
2963 rhaas@postgresql.org 1945 : 74 : tuplesort_performsort(btspool2->sortstate);
1946 : : }
1947 : :
1948 : : /*
1949 : : * Done. Record ambuild statistics, and whether we encountered a broken
1950 : : * HOT chain.
1951 : : */
1952 [ - + ]: 167 : SpinLockAcquire(&btshared->mutex);
1953 : 167 : btshared->nparticipantsdone++;
1954 : 167 : btshared->reltuples += reltuples;
1955 [ - + ]: 167 : if (buildstate.havedead)
2963 rhaas@postgresql.org 1956 :UBC 0 : btshared->havedead = true;
2963 rhaas@postgresql.org 1957 :CBC 167 : btshared->indtuples += buildstate.indtuples;
1958 [ - + ]: 167 : if (indexInfo->ii_BrokenHotChain)
2963 rhaas@postgresql.org 1959 :UBC 0 : btshared->brokenhotchain = true;
2963 rhaas@postgresql.org 1960 :CBC 167 : SpinLockRelease(&btshared->mutex);
1961 : :
1962 : : /* Notify leader */
1963 : 167 : ConditionVariableSignal(&btshared->workersdonecv);
1964 : :
1965 : : /* We can end tuplesorts immediately */
1966 : 167 : tuplesort_end(btspool->sortstate);
1967 [ + + ]: 167 : if (btspool2)
1968 : 74 : tuplesort_end(btspool2->sortstate);
1969 : 167 : }
|