Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * gist.c
4 : : * interface routines for the postgres GiST index access method.
5 : : *
6 : : *
7 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8 : : * Portions Copyright (c) 1994, Regents of the University of California
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/access/gist/gist.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #include "postgres.h"
16 : :
17 : : #include "access/gist_private.h"
18 : : #include "access/gistscan.h"
19 : : #include "access/xloginsert.h"
20 : : #include "catalog/pg_collation.h"
21 : : #include "commands/vacuum.h"
22 : : #include "miscadmin.h"
23 : : #include "nodes/execnodes.h"
24 : : #include "storage/predicate.h"
25 : : #include "utils/fmgrprotos.h"
26 : : #include "utils/index_selfuncs.h"
27 : : #include "utils/memutils.h"
28 : : #include "utils/rel.h"
29 : :
30 : : /* non-export function prototypes */
31 : : static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
32 : : static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
33 : : GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum);
34 : : static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
35 : : GISTSTATE *giststate,
36 : : IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
37 : : Buffer leftchild, Buffer rightchild,
38 : : bool unlockbuf, bool unlockleftchild);
39 : : static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
40 : : GISTSTATE *giststate, List *splitinfo, bool unlockbuf);
41 : : static void gistprunepage(Relation rel, Page page, Buffer buffer,
42 : : Relation heapRel);
43 : :
44 : :
45 : : #define ROTATEDIST(d) do { \
46 : : SplitPageLayout *tmp = palloc0_object(SplitPageLayout); \
47 : : tmp->block.blkno = InvalidBlockNumber; \
48 : : tmp->buffer = InvalidBuffer; \
49 : : tmp->next = (d); \
50 : : (d)=tmp; \
51 : : } while(0)
52 : :
53 : :
54 : : /*
55 : : * GiST handler function: return IndexAmRoutine with access method parameters
56 : : * and callbacks.
57 : : */
58 : : Datum
3621 tgl@sss.pgh.pa.us 59 :CBC 7710 : gisthandler(PG_FUNCTION_ARGS)
60 : : {
61 : 7710 : IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
62 : :
63 : 7710 : amroutine->amstrategies = 0;
3519 teodor@sigaev.ru 64 : 7710 : amroutine->amsupport = GISTNProcs;
2087 akorotkov@postgresql 65 : 7710 : amroutine->amoptsprocnum = GIST_OPTIONS_PROC;
3621 tgl@sss.pgh.pa.us 66 : 7710 : amroutine->amcanorder = false;
67 : 7710 : amroutine->amcanorderbyop = true;
292 peter@eisentraut.org 68 : 7710 : amroutine->amcanhash = false;
284 69 : 7710 : amroutine->amconsistentequality = false;
70 : 7710 : amroutine->amconsistentordering = false;
3621 tgl@sss.pgh.pa.us 71 : 7710 : amroutine->amcanbackward = false;
72 : 7710 : amroutine->amcanunique = false;
73 : 7710 : amroutine->amcanmulticol = true;
74 : 7710 : amroutine->amoptionalkey = true;
75 : 7710 : amroutine->amsearcharray = false;
76 : 7710 : amroutine->amsearchnulls = true;
77 : 7710 : amroutine->amstorage = true;
78 : 7710 : amroutine->amclusterable = true;
2821 teodor@sigaev.ru 79 : 7710 : amroutine->ampredlocks = true;
3226 rhaas@postgresql.org 80 : 7710 : amroutine->amcanparallel = false;
739 tomas.vondra@postgre 81 : 7710 : amroutine->amcanbuildparallel = false;
2473 akorotkov@postgresql 82 : 7710 : amroutine->amcaninclude = true;
2162 akapila@postgresql.o 83 : 7710 : amroutine->amusemaintenanceworkmem = false;
1002 tomas.vondra@postgre 84 : 7710 : amroutine->amsummarizing = false;
2162 akapila@postgresql.o 85 : 7710 : amroutine->amparallelvacuumoptions =
86 : : VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP;
3621 tgl@sss.pgh.pa.us 87 : 7710 : amroutine->amkeytype = InvalidOid;
88 : :
89 : 7710 : amroutine->ambuild = gistbuild;
90 : 7710 : amroutine->ambuildempty = gistbuildempty;
91 : 7710 : amroutine->aminsert = gistinsert;
752 tomas.vondra@postgre 92 : 7710 : amroutine->aminsertcleanup = NULL;
3621 tgl@sss.pgh.pa.us 93 : 7710 : amroutine->ambulkdelete = gistbulkdelete;
94 : 7710 : amroutine->amvacuumcleanup = gistvacuumcleanup;
95 : 7710 : amroutine->amcanreturn = gistcanreturn;
96 : 7710 : amroutine->amcostestimate = gistcostestimate;
462 peter@eisentraut.org 97 : 7710 : amroutine->amgettreeheight = NULL;
3621 tgl@sss.pgh.pa.us 98 : 7710 : amroutine->amoptions = gistoptions;
3412 99 : 7710 : amroutine->amproperty = gistproperty;
2450 alvherre@alvh.no-ip. 100 : 7710 : amroutine->ambuildphasename = NULL;
3621 tgl@sss.pgh.pa.us 101 : 7710 : amroutine->amvalidate = gistvalidate;
1963 102 : 7710 : amroutine->amadjustmembers = gistadjustmembers;
3621 103 : 7710 : amroutine->ambeginscan = gistbeginscan;
104 : 7710 : amroutine->amrescan = gistrescan;
105 : 7710 : amroutine->amgettuple = gistgettuple;
106 : 7710 : amroutine->amgetbitmap = gistgetbitmap;
107 : 7710 : amroutine->amendscan = gistendscan;
108 : 7710 : amroutine->ammarkpos = NULL;
109 : 7710 : amroutine->amrestrpos = NULL;
3248 rhaas@postgresql.org 110 : 7710 : amroutine->amestimateparallelscan = NULL;
111 : 7710 : amroutine->aminitparallelscan = NULL;
112 : 7710 : amroutine->amparallelrescan = NULL;
317 peter@eisentraut.org 113 : 7710 : amroutine->amtranslatestrategy = NULL;
316 114 : 7710 : amroutine->amtranslatecmptype = gisttranslatecmptype;
115 : :
3621 tgl@sss.pgh.pa.us 116 : 7710 : PG_RETURN_POINTER(amroutine);
117 : : }
118 : :
119 : : /*
120 : : * Create and return a temporary memory context for use by GiST. We
121 : : * _always_ invoke user-provided methods in a temporary memory
122 : : * context, so that memory leaks in those functions cannot cause
123 : : * problems. Also, we use some additional temporary contexts in the
124 : : * GiST code itself, to avoid the need to do some awkward manual
125 : : * memory management.
126 : : */
127 : : MemoryContext
7390 bruce@momjian.us 128 : 4884 : createTempGistContext(void)
129 : : {
130 : 4884 : return AllocSetContextCreate(CurrentMemoryContext,
131 : : "GiST temporary context",
132 : : ALLOCSET_DEFAULT_SIZES);
133 : : }
134 : :
135 : : /*
136 : : * gistbuildempty() -- build an empty gist index in the initialization fork
137 : : */
138 : : void
3621 tgl@sss.pgh.pa.us 139 : 5 : gistbuildempty(Relation index)
140 : : {
141 : : Buffer buffer;
142 : :
143 : : /* Initialize the root page */
846 tmunro@postgresql.or 144 : 5 : buffer = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,
145 : : EB_SKIP_EXTENSION_LOCK | EB_LOCK_FIRST);
146 : :
147 : : /* Initialize and xlog buffer */
4691 heikki.linnakangas@i 148 : 5 : START_CRIT_SECTION();
149 : 5 : GISTInitBuffer(buffer, F_LEAF);
150 : 5 : MarkBufferDirty(buffer);
4395 151 : 5 : log_newpage_buffer(buffer, true);
4691 152 [ - + ]: 5 : END_CRIT_SECTION();
153 : :
154 : : /* Unlock and release the buffer */
155 : 5 : UnlockReleaseBuffer(buffer);
5466 rhaas@postgresql.org 156 : 5 : }
157 : :
158 : : /*
159 : : * gistinsert -- wrapper for GiST tuple insertion.
160 : : *
161 : : * This is the public interface routine for tuple insertion in GiSTs.
162 : : * It doesn't do any work; just locks the relation and passes the buck.
163 : : */
164 : : bool
3621 tgl@sss.pgh.pa.us 165 : 152115 : gistinsert(Relation r, Datum *values, bool *isnull,
166 : : ItemPointer ht_ctid, Relation heapRel,
167 : : IndexUniqueCheck checkUnique,
168 : : bool indexUnchanged,
169 : : IndexInfo *indexInfo)
170 : : {
3232 171 : 152115 : GISTSTATE *giststate = (GISTSTATE *) indexInfo->ii_AmCache;
172 : : IndexTuple itup;
173 : : MemoryContext oldCxt;
174 : :
175 : : /* Initialize GISTSTATE cache if first call in this statement */
176 [ + + ]: 152115 : if (giststate == NULL)
177 : : {
178 : 960 : oldCxt = MemoryContextSwitchTo(indexInfo->ii_Context);
179 : 960 : giststate = initGISTstate(r);
180 : 960 : giststate->tempCxt = createTempGistContext();
383 peter@eisentraut.org 181 : 960 : indexInfo->ii_AmCache = giststate;
3232 tgl@sss.pgh.pa.us 182 : 960 : MemoryContextSwitchTo(oldCxt);
183 : : }
184 : :
5191 185 : 152115 : oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
186 : :
404 michael@paquier.xyz 187 : 152115 : itup = gistFormTuple(giststate, r, values, isnull, true);
10327 bruce@momjian.us 188 : 152114 : itup->t_tid = *ht_ctid;
189 : :
2449 heikki.linnakangas@i 190 : 152114 : gistdoinsert(r, itup, 0, giststate, heapRel, false);
191 : :
192 : : /* cleanup */
5191 tgl@sss.pgh.pa.us 193 : 152108 : MemoryContextSwitchTo(oldCxt);
3232 194 : 152108 : MemoryContextReset(giststate->tempCxt);
195 : :
3621 196 : 152108 : return false;
197 : : }
198 : :
199 : :
200 : : /*
201 : : * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
202 : : * at that offset is atomically removed along with inserting the new tuples.
203 : : * This is used to replace a tuple with a new one.
204 : : *
205 : : * If 'leftchildbuf' is valid, we're inserting the downlink for the page
206 : : * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
207 : : * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
208 : : *
209 : : * If 'markfollowright' is true and the page is split, the left child is
210 : : * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
211 : : * index build, however, there is no concurrent access and the page splitting
212 : : * is done in a slightly simpler fashion, and false is passed.
213 : : *
214 : : * If there is not enough room on the page, it is split. All the split
215 : : * pages are kept pinned and locked and returned in *splitinfo, the caller
216 : : * is responsible for inserting the downlinks for them. However, if
217 : : * 'buffer' is the root page and it needs to be split, gistplacetopage()
218 : : * performs the split as one atomic operation, and *splitinfo is set to NIL.
219 : : * In that case, we continue to hold the root page locked, and the child
220 : : * pages are released; note that new tuple(s) are *not* on the root page
221 : : * but in one of the new child pages.
222 : : *
223 : : * If 'newblkno' is not NULL, returns the block number of page the first
224 : : * new/updated tuple was inserted to. Usually it's the given page, but could
225 : : * be its right sibling if the page was split.
226 : : *
227 : : * Returns 'true' if the page was split, 'false' otherwise.
228 : : */
229 : : bool
5213 heikki.linnakangas@i 230 : 816131 : gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
231 : : Buffer buffer,
232 : : IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
233 : : BlockNumber *newblkno,
234 : : Buffer leftchildbuf,
235 : : List **splitinfo,
236 : : bool markfollowright,
237 : : Relation heapRel,
238 : : bool is_build)
239 : : {
4870 240 : 816131 : BlockNumber blkno = BufferGetBlockNumber(buffer);
3527 kgrittn@postgresql.o 241 : 816131 : Page page = BufferGetPage(buffer);
5472 heikki.linnakangas@i 242 : 816131 : bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
243 : : XLogRecPtr recptr;
244 : : bool is_split;
245 : :
246 : : /*
247 : : * Refuse to modify a page that's incompletely split. This should not
248 : : * happen because we finish any incomplete splits while we walk down the
249 : : * tree. However, it's remotely possible that another concurrent inserter
250 : : * splits a parent page, and errors out before completing the split. We
251 : : * will just throw an error in that case, and leave any split we had in
252 : : * progress unfinished too. The next insert that comes along will clean up
253 : : * the mess.
254 : : */
255 [ - + ]: 816131 : if (GistFollowRight(page))
5472 heikki.linnakangas@i 256 [ # # ]:UBC 0 : elog(ERROR, "concurrent GiST page split was incomplete");
257 : :
258 : : /* should never try to insert to a deleted page */
1791 heikki.linnakangas@i 259 [ - + ]:CBC 816131 : Assert(!GistPageIsDeleted(page));
260 : :
5472 261 : 816131 : *splitinfo = NIL;
262 : :
263 : : /*
264 : : * if isupdate, remove old key: This node's key has been modified, either
265 : : * because a child split occurred or because we needed to adjust our key
266 : : * for an insert in a child node. Therefore, remove the old version of
267 : : * this node's key.
268 : : *
269 : : * for WAL replay, in the non-split case we handle this by setting up a
270 : : * one-element todelete array; in the split case, it's handled implicitly
271 : : * because the tuple vector passed to gistSplit won't include this tuple.
272 : : */
5213 273 : 816131 : is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
274 : :
275 : : /*
276 : : * If leaf page is full, try at first to delete dead tuples. And then
277 : : * check again.
278 : : */
3751 teodor@sigaev.ru 279 [ + + + + : 816131 : if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
- + ]
280 : : {
2478 heikki.linnakangas@i 281 :UBC 0 : gistprunepage(rel, page, buffer, heapRel);
3751 teodor@sigaev.ru 282 : 0 : is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
283 : : }
284 : :
5472 heikki.linnakangas@i 285 [ + + ]:CBC 816131 : if (is_split)
286 : : {
287 : : /* no space for insertion */
288 : : IndexTuple *itvec;
289 : : int tlen;
714 rhaas@postgresql.org 290 : 12539 : SplitPageLayout *dist = NULL,
291 : : *ptr;
5472 heikki.linnakangas@i 292 : 12539 : BlockNumber oldrlink = InvalidBlockNumber;
4923 293 : 12539 : GistNSN oldnsn = 0;
294 : : SplitPageLayout rootpg;
295 : : bool is_rootsplit;
296 : : int npage;
297 : :
5472 298 : 12539 : is_rootsplit = (blkno == GIST_ROOT_BLKNO);
299 : :
300 : : /*
301 : : * Form index tuples vector to split. If we're replacing an old tuple,
302 : : * remove the old version from the vector.
303 : : */
304 : 12539 : itvec = gistextractpage(page, &tlen);
305 [ + + + - : 12539 : if (OffsetNumberIsValid(oldoffnum))
+ + ]
306 : : {
307 : : /* on inner page we should remove old tuple */
308 : 2701 : int pos = oldoffnum - FirstOffsetNumber;
309 : :
7013 bruce@momjian.us 310 : 2701 : tlen--;
311 [ + + ]: 2701 : if (pos != tlen)
312 : 1765 : memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
313 : : }
5472 heikki.linnakangas@i 314 : 12539 : itvec = gistjoinvector(itvec, &tlen, itup, ntup);
5213 315 : 12539 : dist = gistSplit(rel, page, itvec, tlen, giststate);
316 : :
317 : : /*
318 : : * Check that split didn't produce too many pages.
319 : : */
4275 320 : 12539 : npage = 0;
321 [ + + ]: 37648 : for (ptr = dist; ptr; ptr = ptr->next)
322 : 25109 : npage++;
323 : : /* in a root split, we'll add one more page to the list below */
324 [ + + ]: 12539 : if (is_rootsplit)
325 : 176 : npage++;
326 [ - + ]: 12539 : if (npage > GIST_MAX_SPLIT_PAGES)
4275 heikki.linnakangas@i 327 [ # # ]:UBC 0 : elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
328 : : npage, GIST_MAX_SPLIT_PAGES);
329 : :
330 : : /*
331 : : * Set up pages to work with. Allocate new buffers for all but the
332 : : * leftmost page. The original page becomes the new leftmost page, and
333 : : * is just replaced with the new contents.
334 : : *
335 : : * For a root-split, allocate new buffers for all child pages, the
336 : : * original page is overwritten with new root page containing
337 : : * downlinks to the new child pages.
338 : : */
5472 heikki.linnakangas@i 339 :CBC 12539 : ptr = dist;
340 [ + + ]: 12539 : if (!is_rootsplit)
341 : : {
342 : : /* save old rightlink and NSN */
343 : 12363 : oldrlink = GistPageGetOpaque(page)->rightlink;
4716 344 : 12363 : oldnsn = GistPageGetNSN(page);
345 : :
5472 346 : 12363 : dist->buffer = buffer;
347 : 12363 : dist->block.blkno = BufferGetBlockNumber(buffer);
3527 kgrittn@postgresql.o 348 : 12363 : dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
349 : :
350 : : /* clean all flags except F_LEAF */
7160 teodor@sigaev.ru 351 : 12363 : GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
352 : :
5472 heikki.linnakangas@i 353 : 12363 : ptr = ptr->next;
354 : : }
355 [ + + ]: 25285 : for (; ptr; ptr = ptr->next)
356 : : {
357 : : /* Allocate new page */
990 andres@anarazel.de 358 : 12746 : ptr->buffer = gistNewBuffer(rel, heapRel);
5472 heikki.linnakangas@i 359 : 12746 : GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
3527 kgrittn@postgresql.o 360 : 12746 : ptr->page = BufferGetPage(ptr->buffer);
5472 heikki.linnakangas@i 361 : 12746 : ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
2821 teodor@sigaev.ru 362 : 12746 : PredicateLockPageSplit(rel,
363 : : BufferGetBlockNumber(buffer),
364 : : BufferGetBlockNumber(ptr->buffer));
365 : : }
366 : :
367 : : /*
368 : : * Now that we know which blocks the new pages go to, set up downlink
369 : : * tuples to point to them.
370 : : */
7013 bruce@momjian.us 371 [ + + ]: 37648 : for (ptr = dist; ptr; ptr = ptr->next)
372 : : {
5472 heikki.linnakangas@i 373 : 25109 : ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
374 : 25109 : GistTupleSetValid(ptr->itup);
375 : : }
376 : :
377 : : /*
378 : : * If this is a root split, we construct the new root page with the
379 : : * downlinks here directly, instead of requiring the caller to insert
380 : : * them. Add the new root page to the list along with the child pages.
381 : : */
382 [ + + ]: 12539 : if (is_rootsplit)
383 : : {
384 : : IndexTuple *downlinks;
5364 bruce@momjian.us 385 : 176 : int ndownlinks = 0;
386 : : int i;
387 : :
5472 heikki.linnakangas@i 388 : 176 : rootpg.buffer = buffer;
3527 kgrittn@postgresql.o 389 : 176 : rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
5472 heikki.linnakangas@i 390 : 176 : GistPageGetOpaque(rootpg.page)->flags = 0;
391 : :
392 : : /* Prepare a vector of all the downlinks */
393 [ + + ]: 532 : for (ptr = dist; ptr; ptr = ptr->next)
394 : 356 : ndownlinks++;
6 michael@paquier.xyz 395 :GNC 176 : downlinks = palloc_array(IndexTuple, ndownlinks);
5472 heikki.linnakangas@i 396 [ + + ]:CBC 532 : for (i = 0, ptr = dist; ptr; ptr = ptr->next)
397 : 356 : downlinks[i++] = ptr->itup;
398 : :
399 : 176 : rootpg.block.blkno = GIST_ROOT_BLKNO;
400 : 176 : rootpg.block.num = ndownlinks;
401 : 176 : rootpg.list = gistfillitupvec(downlinks, ndownlinks,
402 : : &(rootpg.lenlist));
403 : 176 : rootpg.itup = NULL;
404 : :
405 : 176 : rootpg.next = dist;
406 : 176 : dist = &rootpg;
407 : : }
408 : : else
409 : : {
410 : : /* Prepare split-info to be returned to caller */
411 [ + + ]: 37116 : for (ptr = dist; ptr; ptr = ptr->next)
412 : : {
6 michael@paquier.xyz 413 :GNC 24753 : GISTPageSplitInfo *si = palloc_object(GISTPageSplitInfo);
414 : :
5472 heikki.linnakangas@i 415 :CBC 24753 : si->buf = ptr->buffer;
416 : 24753 : si->downlink = ptr->itup;
417 : 24753 : *splitinfo = lappend(*splitinfo, si);
418 : : }
419 : : }
420 : :
421 : : /*
422 : : * Fill all pages. All the pages are new, ie. freshly allocated empty
423 : : * pages, or a temporary copy of the old page.
424 : : */
425 [ + + ]: 37824 : for (ptr = dist; ptr; ptr = ptr->next)
426 : : {
5364 bruce@momjian.us 427 : 25285 : char *data = (char *) (ptr->list);
428 : :
1210 drowley@postgresql.o 429 [ + + ]: 883895 : for (int i = 0; i < ptr->block.num; i++)
430 : : {
4870 heikki.linnakangas@i 431 : 858610 : IndexTuple thistup = (IndexTuple) data;
432 : :
50 peter@eisentraut.org 433 [ - + ]:GNC 858610 : if (PageAddItem(ptr->page, data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
5213 heikki.linnakangas@i 434 [ # # ]:UBC 0 : elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
435 : :
436 : : /*
437 : : * If this is the first inserted/updated tuple, let the caller
438 : : * know which page it landed on.
439 : : */
4870 heikki.linnakangas@i 440 [ + + + + ]:CBC 858610 : if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
441 : 387 : *newblkno = ptr->block.blkno;
442 : :
443 : 858610 : data += IndexTupleSize(thistup);
444 : : }
445 : :
446 : : /* Set up rightlinks */
5472 447 [ + + + + ]: 25285 : if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
448 : 25140 : GistPageGetOpaque(ptr->page)->rightlink =
449 : 12570 : ptr->next->block.blkno;
450 : : else
451 : 12715 : GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
452 : :
453 : : /*
454 : : * Mark the all but the right-most page with the follow-right
455 : : * flag. It will be cleared as soon as the downlink is inserted
456 : : * into the parent, but this ensures that if we error out before
457 : : * that, the index is still consistent. (in buffering build mode,
458 : : * any error will abort the index build anyway, so this is not
459 : : * needed.)
460 : : */
5213 461 [ + + + + : 25285 : if (ptr->next && !is_rootsplit && markfollowright)
+ + ]
5472 462 : 12006 : GistMarkFollowRight(ptr->page);
463 : : else
464 : 13279 : GistClearFollowRight(ptr->page);
465 : :
466 : : /*
467 : : * Copy the NSN of the original page to all pages. The
468 : : * F_FOLLOW_RIGHT flags ensure that scans will follow the
469 : : * rightlinks until the downlinks are inserted.
470 : : */
4716 471 : 25285 : GistPageSetNSN(ptr->page, oldnsn);
472 : : }
473 : :
474 : : /*
475 : : * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
476 : : * insertion for that. NB: The number of pages and data segments
477 : : * specified here must match the calculations in gistXLogSplit()!
478 : : */
2449 479 [ + + + + : 12539 : if (!is_build && RelationNeedsWAL(rel))
+ + + - +
- ]
4044 480 : 1724 : XLogEnsureRecordSpace(npage, 1 + npage * 2);
481 : :
7160 teodor@sigaev.ru 482 : 12539 : START_CRIT_SECTION();
483 : :
484 : : /*
485 : : * Must mark buffers dirty before XLogInsert, even though we'll still
486 : : * be changing their opaque fields below.
487 : : */
7013 bruce@momjian.us 488 [ + + ]: 37824 : for (ptr = dist; ptr; ptr = ptr->next)
7200 tgl@sss.pgh.pa.us 489 : 25285 : MarkBufferDirty(ptr->buffer);
5472 heikki.linnakangas@i 490 [ + + ]: 12539 : if (BufferIsValid(leftchildbuf))
491 : 2626 : MarkBufferDirty(leftchildbuf);
492 : :
493 : : /*
494 : : * The first page in the chain was a temporary working copy meant to
495 : : * replace the old page. Copy it over the old page.
496 : : */
3527 kgrittn@postgresql.o 497 : 12539 : PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
498 : 12539 : dist->page = BufferGetPage(dist->buffer);
499 : :
500 : : /*
501 : : * Write the WAL record.
502 : : *
503 : : * If we're building a new index, however, we don't WAL-log changes
504 : : * yet. The LSN-NSN interlock between parent and child requires that
505 : : * LSNs never move backwards, so set the LSNs to a value that's
506 : : * smaller than any real or fake unlogged LSN that might be generated
507 : : * later. (There can't be any concurrent scans during index build, so
508 : : * we don't need to be able to detect concurrent splits yet.)
509 : : */
2449 heikki.linnakangas@i 510 [ + + ]: 12539 : if (is_build)
511 : 10812 : recptr = GistBuildLSN;
512 : : else
513 : : {
514 [ + + + + : 1727 : if (RelationNeedsWAL(rel))
+ - + - ]
515 : 1724 : recptr = gistXLogSplit(is_leaf,
516 : : dist, oldrlink, oldnsn, leftchildbuf,
517 : : markfollowright);
518 : : else
519 : 3 : recptr = gistGetFakeLSN(rel);
520 : : }
521 : :
7013 bruce@momjian.us 522 [ + + ]: 37824 : for (ptr = dist; ptr; ptr = ptr->next)
5472 heikki.linnakangas@i 523 : 25285 : PageSetLSN(ptr->page, recptr);
524 : :
525 : : /*
526 : : * Return the new child buffers to the caller.
527 : : *
528 : : * If this was a root split, we've already inserted the downlink
529 : : * pointers, in the form of a new root page. Therefore we can release
530 : : * all the new buffers, and keep just the root page locked.
531 : : */
532 [ + + ]: 12539 : if (is_rootsplit)
533 : : {
534 [ + + ]: 532 : for (ptr = dist->next; ptr; ptr = ptr->next)
535 : 356 : UnlockReleaseBuffer(ptr->buffer);
536 : : }
537 : : }
538 : : else
539 : : {
540 : : /*
541 : : * Enough space. We always get here if ntup==0.
542 : : */
7160 teodor@sigaev.ru 543 : 803592 : START_CRIT_SECTION();
544 : :
545 : : /*
546 : : * Delete old tuple if any, then insert new tuple(s) if any. If
547 : : * possible, use the fast path of PageIndexTupleOverwrite.
548 : : */
5472 heikki.linnakangas@i 549 [ + + + - : 803592 : if (OffsetNumberIsValid(oldoffnum))
+ + ]
550 : : {
3385 tgl@sss.pgh.pa.us 551 [ + + ]: 368421 : if (ntup == 1)
552 : : {
553 : : /* One-for-one replacement, so use PageIndexTupleOverwrite */
50 peter@eisentraut.org 554 [ - + ]:GNC 358690 : if (!PageIndexTupleOverwrite(page, oldoffnum, *itup, IndexTupleSize(*itup)))
3385 tgl@sss.pgh.pa.us 555 [ # # ]:UBC 0 : elog(ERROR, "failed to add item to index page in \"%s\"",
556 : : RelationGetRelationName(rel));
557 : : }
558 : : else
559 : : {
560 : : /* Delete old, then append new tuple(s) to page */
3385 tgl@sss.pgh.pa.us 561 :CBC 9731 : PageIndexTupleDelete(page, oldoffnum);
562 : 9731 : gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
563 : : }
564 : : }
565 : : else
566 : : {
567 : : /* Just append new tuples at the end of the page */
568 : 435171 : gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
569 : : }
570 : :
5472 heikki.linnakangas@i 571 : 803592 : MarkBufferDirty(buffer);
572 : :
573 [ + + ]: 803592 : if (BufferIsValid(leftchildbuf))
574 : 9380 : MarkBufferDirty(leftchildbuf);
575 : :
2449 576 [ + + ]: 803592 : if (is_build)
577 : 556321 : recptr = GistBuildLSN;
578 : : else
579 : : {
580 [ + + + + : 247271 : if (RelationNeedsWAL(rel))
+ - + - ]
7390 bruce@momjian.us 581 : 247232 : {
2449 heikki.linnakangas@i 582 : 247232 : OffsetNumber ndeloffs = 0,
583 : : deloffs[1];
584 : :
585 [ + + + - : 247232 : if (OffsetNumberIsValid(oldoffnum))
+ + ]
586 : : {
587 : 96878 : deloffs[0] = oldoffnum;
588 : 96878 : ndeloffs = 1;
589 : : }
590 : :
591 : 247232 : recptr = gistXLogUpdate(buffer,
592 : : deloffs, ndeloffs, itup, ntup,
593 : : leftchildbuf);
594 : : }
595 : : else
596 : 39 : recptr = gistGetFakeLSN(rel);
597 : : }
598 : 803592 : PageSetLSN(page, recptr);
599 : :
4870 600 [ + + ]: 803592 : if (newblkno)
601 : 52185 : *newblkno = blkno;
602 : : }
603 : :
604 : : /*
605 : : * If we inserted the downlink for a child page, set NSN and clear
606 : : * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
607 : : * follow the rightlink if and only if they looked at the parent page
608 : : * before we inserted the downlink.
609 : : *
610 : : * Note that we do this *after* writing the WAL record. That means that
611 : : * the possible full page image in the WAL record does not include these
612 : : * changes, and they must be replayed even if the page is restored from
613 : : * the full page image. There's a chicken-and-egg problem: if we updated
614 : : * the child pages first, we wouldn't know the recptr of the WAL record
615 : : * we're about to write.
616 : : */
5472 617 [ + + ]: 816131 : if (BufferIsValid(leftchildbuf))
618 : : {
3527 kgrittn@postgresql.o 619 : 12006 : Page leftpg = BufferGetPage(leftchildbuf);
620 : :
4716 heikki.linnakangas@i 621 : 12006 : GistPageSetNSN(leftpg, recptr);
5472 622 : 12006 : GistClearFollowRight(leftpg);
623 : :
624 : 12006 : PageSetLSN(leftpg, recptr);
625 : : }
626 : :
627 [ - + ]: 816131 : END_CRIT_SECTION();
628 : :
629 : 816131 : return is_split;
630 : : }
631 : :
632 : : /*
633 : : * Workhorse routine for doing insertion into a GiST index. Note that
634 : : * this routine assumes it is invoked in a short-lived memory context,
635 : : * so it does not bother releasing palloc'd allocations.
636 : : */
637 : : void
2552 akorotkov@postgresql 638 : 427273 : gistdoinsert(Relation r, IndexTuple itup, Size freespace,
639 : : GISTSTATE *giststate, Relation heapRel, bool is_build)
640 : : {
641 : : ItemId iid;
642 : : IndexTuple idxtuple;
643 : : GISTInsertStack firststack;
644 : : GISTInsertStack *stack;
645 : : GISTInsertState state;
5472 heikki.linnakangas@i 646 : 427273 : bool xlocked = false;
647 : :
648 : 427273 : memset(&state, 0, sizeof(GISTInsertState));
649 : 427273 : state.freespace = freespace;
650 : 427273 : state.r = r;
2552 akorotkov@postgresql 651 : 427273 : state.heapRel = heapRel;
2449 heikki.linnakangas@i 652 : 427273 : state.is_build = is_build;
653 : :
654 : : /* Start from the root */
5472 655 : 427273 : firststack.blkno = GIST_ROOT_BLKNO;
4923 656 : 427273 : firststack.lsn = 0;
2408 657 : 427273 : firststack.retry_from_parent = false;
5472 658 : 427273 : firststack.parent = NULL;
5268 659 : 427273 : firststack.downlinkoffnum = InvalidOffsetNumber;
5472 660 : 427273 : state.stack = stack = &firststack;
661 : :
662 : : /*
663 : : * Walk down along the path of smallest penalty, updating the parent
664 : : * pointers with the key we're inserting as we go. If we crash in the
665 : : * middle, the tree is consistent, although the possible parent updates
666 : : * were a waste.
667 : : */
668 : : for (;;)
669 : : {
670 : : /*
671 : : * If we split an internal page while descending the tree, we have to
672 : : * retry at the parent. (Normally, the LSN-NSN interlock below would
673 : : * also catch this and cause us to retry. But LSNs are not updated
674 : : * during index build.)
675 : : */
2408 676 [ + + ]: 986568 : while (stack->retry_from_parent)
677 : : {
678 [ - + ]: 152 : if (xlocked)
2408 heikki.linnakangas@i 679 :UBC 0 : LockBuffer(stack->buffer, GIST_UNLOCK);
2408 heikki.linnakangas@i 680 :CBC 152 : xlocked = false;
681 : 152 : ReleaseBuffer(stack->buffer);
682 : 152 : state.stack = stack = stack->parent;
683 : : }
684 : :
40 alvherre@kurilemu.de 685 [ + + ]:GNC 986416 : if (!XLogRecPtrIsValid(stack->lsn))
5472 heikki.linnakangas@i 686 :CBC 986347 : stack->buffer = ReadBuffer(state.r, stack->blkno);
687 : :
688 : : /*
689 : : * Be optimistic and grab shared lock first. Swap it for an exclusive
690 : : * lock later if we need to update the page.
691 : : */
692 [ + + ]: 986416 : if (!xlocked)
693 : : {
694 : 986415 : LockBuffer(stack->buffer, GIST_SHARE);
695 : 986415 : gistcheckpage(state.r, stack->buffer);
696 : : }
697 : :
109 peter@eisentraut.org 698 :GNC 986416 : stack->page = BufferGetPage(stack->buffer);
2898 alvherre@alvh.no-ip. 699 :CBC 986416 : stack->lsn = xlocked ?
700 [ + + ]: 986416 : PageGetLSN(stack->page) : BufferGetLSNAtomic(stack->buffer);
40 alvherre@kurilemu.de 701 [ + + + + :GNC 986416 : Assert(!RelationNeedsWAL(state.r) || XLogRecPtrIsValid(stack->lsn));
+ + + + -
+ ]
702 : :
703 : : /*
704 : : * If this page was split but the downlink was never inserted to the
705 : : * parent because the inserting backend crashed before doing that, fix
706 : : * that now.
707 : : */
5472 heikki.linnakangas@i 708 [ - + ]:CBC 986416 : if (GistFollowRight(stack->page))
709 : : {
5472 heikki.linnakangas@i 710 [ # # ]:UBC 0 : if (!xlocked)
711 : : {
712 : 0 : LockBuffer(stack->buffer, GIST_UNLOCK);
713 : 0 : LockBuffer(stack->buffer, GIST_EXCLUSIVE);
714 : 0 : xlocked = true;
715 : : /* someone might've completed the split when we unlocked */
716 [ # # ]: 0 : if (!GistFollowRight(stack->page))
717 : 0 : continue;
718 : : }
719 : 0 : gistfixsplit(&state, giststate);
720 : :
721 : 0 : UnlockReleaseBuffer(stack->buffer);
722 : 0 : xlocked = false;
723 : 0 : state.stack = stack = stack->parent;
724 : 0 : continue;
725 : : }
726 : :
2337 heikki.linnakangas@i 727 [ + + + - ]:CBC 1545506 : if ((stack->blkno != GIST_ROOT_BLKNO &&
728 : 559090 : stack->parent->lsn < GistPageGetNSN(stack->page)) ||
729 [ - + ]: 986416 : GistPageIsDeleted(stack->page))
730 : : {
731 : : /*
732 : : * Concurrent split or page deletion detected. There's no
733 : : * guarantee that the downlink for this page is consistent with
734 : : * the tuple we're inserting anymore, so go back to parent and
735 : : * rechoose the best child.
736 : : */
5472 heikki.linnakangas@i 737 :UBC 0 : UnlockReleaseBuffer(stack->buffer);
738 : 0 : xlocked = false;
739 : 0 : state.stack = stack = stack->parent;
7477 teodor@sigaev.ru 740 : 0 : continue;
741 : : }
742 : :
5472 heikki.linnakangas@i 743 [ + + ]:CBC 986416 : if (!GistPageIsLeaf(stack->page))
744 : : {
745 : : /*
746 : : * This is an internal page so continue to walk down the tree.
747 : : * Find the child node that has the minimum insertion penalty.
748 : : */
749 : : BlockNumber childblkno;
750 : : IndexTuple newtup;
751 : : GISTInsertStack *item;
752 : : OffsetNumber downlinkoffnum;
753 : :
5268 754 : 559143 : downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
755 : 559143 : iid = PageGetItemId(stack->page, downlinkoffnum);
5472 756 : 559143 : idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
757 : 559143 : childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
758 : :
759 : : /*
760 : : * Check that it's not a leftover invalid tuple from pre-9.1
761 : : */
762 [ - + ]: 559143 : if (GistTupleIsInvalid(idxtuple))
5472 heikki.linnakangas@i 763 [ # # ]:UBC 0 : ereport(ERROR,
764 : : (errmsg("index \"%s\" contains an inner tuple marked as invalid",
765 : : RelationGetRelationName(r)),
766 : : errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
767 : : errhint("Please REINDEX it.")));
768 : :
769 : : /*
770 : : * Check that the key representing the target child node is
771 : : * consistent with the key we're inserting. Update it if it's not.
772 : : */
5472 heikki.linnakangas@i 773 :CBC 559143 : newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
774 [ + + ]: 559143 : if (newtup)
775 : : {
776 : : /*
777 : : * Swap shared lock for an exclusive one. Beware, the page may
778 : : * change while we unlock/lock the page...
779 : : */
780 [ + - ]: 324286 : if (!xlocked)
781 : : {
782 : 324286 : LockBuffer(stack->buffer, GIST_UNLOCK);
783 : 324286 : LockBuffer(stack->buffer, GIST_EXCLUSIVE);
784 : 324286 : xlocked = true;
109 peter@eisentraut.org 785 :GNC 324286 : stack->page = BufferGetPage(stack->buffer);
786 : :
4736 alvherre@alvh.no-ip. 787 [ - + ]:CBC 324286 : if (PageGetLSN(stack->page) != stack->lsn)
788 : : {
789 : : /* the page was changed while we unlocked it, retry */
5472 heikki.linnakangas@i 790 :UBC 0 : continue;
791 : : }
792 : : }
793 : :
794 : : /*
795 : : * Update the tuple.
796 : : *
797 : : * We still hold the lock after gistinserttuple(), but it
798 : : * might have to split the page to make the updated tuple fit.
799 : : * In that case the updated tuple might migrate to the other
800 : : * half of the split, so we have to go back to the parent and
801 : : * descend back to the half that's a better fit for the new
802 : : * tuple.
803 : : */
4967 heikki.linnakangas@i 804 [ + + ]:CBC 324286 : if (gistinserttuple(&state, stack, giststate, newtup,
805 : : downlinkoffnum))
806 : : {
807 : : /*
808 : : * If this was a root split, the root page continues to be
809 : : * the parent and the updated tuple went to one of the
810 : : * child pages, so we just need to retry from the root
811 : : * page.
812 : : */
5455 813 [ + + ]: 69 : if (stack->blkno != GIST_ROOT_BLKNO)
814 : : {
815 : 68 : UnlockReleaseBuffer(stack->buffer);
816 : 68 : xlocked = false;
817 : 68 : state.stack = stack = stack->parent;
818 : : }
5472 819 : 69 : continue;
820 : : }
821 : : }
822 : 559074 : LockBuffer(stack->buffer, GIST_UNLOCK);
823 : 559074 : xlocked = false;
824 : :
825 : : /* descend to the chosen child */
6 michael@paquier.xyz 826 :GNC 559074 : item = palloc0_object(GISTInsertStack);
5472 heikki.linnakangas@i 827 :CBC 559074 : item->blkno = childblkno;
828 : 559074 : item->parent = stack;
5268 829 : 559074 : item->downlinkoffnum = downlinkoffnum;
5472 830 : 559074 : state.stack = stack = item;
831 : : }
832 : : else
833 : : {
834 : : /*
835 : : * Leaf page. Insert the new key. We've already updated all the
836 : : * parents on the way down, but we might have to split the page if
837 : : * it doesn't fit. gistinserttuple() will take care of that.
838 : : */
839 : :
840 : : /*
841 : : * Swap shared lock for an exclusive one. Be careful, the page may
842 : : * change while we unlock/lock the page...
843 : : */
844 [ + - ]: 427273 : if (!xlocked)
845 : : {
846 : 427273 : LockBuffer(stack->buffer, GIST_UNLOCK);
847 : 427273 : LockBuffer(stack->buffer, GIST_EXCLUSIVE);
848 : 427273 : xlocked = true;
109 peter@eisentraut.org 849 :GNC 427273 : stack->page = BufferGetPage(stack->buffer);
5472 heikki.linnakangas@i 850 :CBC 427273 : stack->lsn = PageGetLSN(stack->page);
851 : :
852 [ + + ]: 427273 : if (stack->blkno == GIST_ROOT_BLKNO)
853 : : {
854 : : /*
855 : : * the only page that can become inner instead of leaf is
856 : : * the root page, so for root we should recheck it
857 : : */
858 [ - + ]: 19401 : if (!GistPageIsLeaf(stack->page))
859 : : {
860 : : /*
861 : : * very rare situation: during unlock/lock index with
862 : : * number of pages = 1 was increased
863 : : */
5472 heikki.linnakangas@i 864 :UBC 0 : LockBuffer(stack->buffer, GIST_UNLOCK);
865 : 0 : xlocked = false;
866 : 0 : continue;
867 : : }
868 : :
869 : : /*
870 : : * we don't need to check root split, because checking
871 : : * leaf/inner is enough to recognize split for root
872 : : */
873 : : }
2337 heikki.linnakangas@i 874 [ + - + - ]:CBC 815744 : else if ((GistFollowRight(stack->page) ||
1791 875 : 407872 : stack->parent->lsn < GistPageGetNSN(stack->page)) ||
2337 876 [ - + ]: 407872 : GistPageIsDeleted(stack->page))
877 : : {
878 : : /*
879 : : * The page was split or deleted while we momentarily
880 : : * unlocked the page. Go back to parent.
881 : : */
5472 heikki.linnakangas@i 882 :UBC 0 : UnlockReleaseBuffer(stack->buffer);
883 : 0 : xlocked = false;
884 : 0 : state.stack = stack = stack->parent;
885 : 0 : continue;
886 : : }
887 : : }
888 : :
889 : : /* now state.stack->(page, buffer and blkno) points to leaf page */
890 : :
4967 heikki.linnakangas@i 891 :CBC 427273 : gistinserttuple(&state, stack, giststate, itup,
892 : : InvalidOffsetNumber);
5472 893 : 427267 : LockBuffer(stack->buffer, GIST_UNLOCK);
894 : :
895 : : /* Release any pins we might still hold before exiting */
896 [ + + ]: 1413382 : for (; stack; stack = stack->parent)
897 : 986115 : ReleaseBuffer(stack->buffer);
7490 teodor@sigaev.ru 898 : 427267 : break;
899 : : }
900 : : }
7477 901 : 427267 : }
902 : :
903 : : /*
904 : : * Traverse the tree to find path from root page to specified "child" block.
905 : : *
906 : : * returns a new insertion stack, starting from the parent of "child", up
907 : : * to the root. *downlinkoffnum is set to the offset of the downlink in the
908 : : * direct parent of child.
909 : : *
910 : : * To prevent deadlocks, this should lock only one page at a time.
911 : : */
912 : : static GISTInsertStack *
5268 heikki.linnakangas@i 913 :UBC 0 : gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
914 : : {
915 : : Page page;
916 : : Buffer buffer;
917 : : OffsetNumber i,
918 : : maxoff;
919 : : ItemId iid;
920 : : IndexTuple idxtuple;
921 : : List *fifo;
922 : : GISTInsertStack *top,
923 : : *ptr;
924 : : BlockNumber blkno;
925 : :
6 michael@paquier.xyz 926 :UNC 0 : top = palloc0_object(GISTInsertStack);
7477 teodor@sigaev.ru 927 :UBC 0 : top->blkno = GIST_ROOT_BLKNO;
5268 heikki.linnakangas@i 928 : 0 : top->downlinkoffnum = InvalidOffsetNumber;
929 : :
930 : 0 : fifo = list_make1(top);
931 [ # # ]: 0 : while (fifo != NIL)
932 : : {
933 : : /* Get next page to visit */
934 : 0 : top = linitial(fifo);
935 : 0 : fifo = list_delete_first(fifo);
936 : :
7201 tgl@sss.pgh.pa.us 937 : 0 : buffer = ReadBuffer(r, top->blkno);
938 : 0 : LockBuffer(buffer, GIST_SHARE);
7345 939 : 0 : gistcheckpage(r, buffer);
109 peter@eisentraut.org 940 :UNC 0 : page = BufferGetPage(buffer);
941 : :
7390 bruce@momjian.us 942 [ # # ]:UBC 0 : if (GistPageIsLeaf(page))
943 : : {
944 : : /*
945 : : * Because we scan the index top-down, all the rest of the pages
946 : : * in the queue must be leaf pages as well.
947 : : */
7200 tgl@sss.pgh.pa.us 948 : 0 : UnlockReleaseBuffer(buffer);
5268 heikki.linnakangas@i 949 : 0 : break;
950 : : }
951 : :
952 : : /* currently, internal pages are never deleted */
2337 953 [ # # ]: 0 : Assert(!GistPageIsDeleted(page));
954 : :
2898 alvherre@alvh.no-ip. 955 : 0 : top->lsn = BufferGetLSNAtomic(buffer);
956 : :
957 : : /*
958 : : * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
959 : : * downlink. This should not normally happen..
960 : : */
5472 heikki.linnakangas@i 961 [ # # ]: 0 : if (GistFollowRight(page))
962 [ # # ]: 0 : elog(ERROR, "concurrent GiST page split was incomplete");
963 : :
4716 964 [ # # # # ]: 0 : if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
7390 bruce@momjian.us 965 [ # # ]: 0 : GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
966 : : {
967 : : /*
968 : : * Page was split while we looked elsewhere. We didn't see the
969 : : * downlink to the right page when we scanned the parent, so add
970 : : * it to the queue now.
971 : : *
972 : : * Put the right page ahead of the queue, so that we visit it
973 : : * next. That's important, because if this is the lowest internal
974 : : * level, just above leaves, we might already have queued up some
975 : : * leaf pages, and we assume that there can't be any non-leaf
976 : : * pages behind leaf pages.
977 : : */
6 michael@paquier.xyz 978 :UNC 0 : ptr = palloc0_object(GISTInsertStack);
7477 teodor@sigaev.ru 979 :UBC 0 : ptr->blkno = GistPageGetOpaque(page)->rightlink;
5268 heikki.linnakangas@i 980 : 0 : ptr->downlinkoffnum = InvalidOffsetNumber;
981 : 0 : ptr->parent = top->parent;
982 : :
983 : 0 : fifo = lcons(ptr, fifo);
984 : : }
985 : :
7477 teodor@sigaev.ru 986 : 0 : maxoff = PageGetMaxOffsetNumber(page);
987 : :
7390 bruce@momjian.us 988 [ # # ]: 0 : for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
989 : : {
7477 teodor@sigaev.ru 990 : 0 : iid = PageGetItemId(page, i);
991 : 0 : idxtuple = (IndexTuple) PageGetItem(page, iid);
992 : 0 : blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
7390 bruce@momjian.us 993 [ # # ]: 0 : if (blkno == child)
994 : : {
995 : : /* Found it! */
7200 tgl@sss.pgh.pa.us 996 : 0 : UnlockReleaseBuffer(buffer);
5268 heikki.linnakangas@i 997 : 0 : *downlinkoffnum = i;
7477 teodor@sigaev.ru 998 : 0 : return top;
999 : : }
1000 : : else
1001 : : {
1002 : : /* Append this child to the list of pages to visit later */
6 michael@paquier.xyz 1003 :UNC 0 : ptr = palloc0_object(GISTInsertStack);
7477 teodor@sigaev.ru 1004 :UBC 0 : ptr->blkno = blkno;
5268 heikki.linnakangas@i 1005 : 0 : ptr->downlinkoffnum = i;
7477 teodor@sigaev.ru 1006 : 0 : ptr->parent = top;
1007 : :
5268 heikki.linnakangas@i 1008 : 0 : fifo = lappend(fifo, ptr);
1009 : : }
1010 : : }
1011 : :
7200 tgl@sss.pgh.pa.us 1012 : 0 : UnlockReleaseBuffer(buffer);
1013 : : }
1014 : :
5268 heikki.linnakangas@i 1015 [ # # ]: 0 : elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
1016 : : RelationGetRelationName(r), child);
1017 : : return NULL; /* keep compiler quiet */
1018 : : }
1019 : :
1020 : : /*
1021 : : * Updates the stack so that child->parent is the correct parent of the
1022 : : * child. child->parent must be exclusively locked on entry, and will
1023 : : * remain so at exit, but it might not be the same page anymore.
1024 : : */
1025 : : static void
812 heikki.linnakangas@i 1026 :CBC 12006 : gistFindCorrectParent(Relation r, GISTInsertStack *child, bool is_build)
1027 : : {
7390 bruce@momjian.us 1028 : 12006 : GISTInsertStack *parent = child->parent;
1029 : : ItemId iid;
1030 : : IndexTuple idxtuple;
1031 : : OffsetNumber maxoff;
1032 : : GISTInsertStack *ptr;
1033 : :
7345 tgl@sss.pgh.pa.us 1034 : 12006 : gistcheckpage(r, parent->buffer);
109 peter@eisentraut.org 1035 :GNC 12006 : parent->page = BufferGetPage(parent->buffer);
812 heikki.linnakangas@i 1036 :CBC 12006 : maxoff = PageGetMaxOffsetNumber(parent->page);
1037 : :
1038 : : /* Check if the downlink is still where it was before */
1039 [ + + + - ]: 12006 : if (child->downlinkoffnum != InvalidOffsetNumber && child->downlinkoffnum <= maxoff)
1040 : : {
1041 : 11999 : iid = PageGetItemId(parent->page, child->downlinkoffnum);
1042 : 11999 : idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
1043 [ + - ]: 11999 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
1044 : 11999 : return; /* still there */
1045 : : }
1046 : :
1047 : : /*
1048 : : * The page has changed since we looked. During normal operation, every
1049 : : * update of a page changes its LSN, so the LSN we memorized should have
1050 : : * changed too.
1051 : : *
1052 : : * During index build, however, we don't WAL-log the changes until we have
1053 : : * built the index, so the LSN doesn't change. There is no concurrent
1054 : : * activity during index build, but we might have changed the parent
1055 : : * ourselves.
1056 : : *
1057 : : * We will also get here if child->downlinkoffnum is invalid. That happens
1058 : : * if 'parent' had been updated by an earlier call to this function on its
1059 : : * grandchild, which had to move right.
1060 : : */
256 1061 [ - + + - : 7 : Assert(parent->lsn != PageGetLSN(parent->page) || is_build ||
- - ]
1062 : : child->downlinkoffnum == InvalidOffsetNumber);
1063 : :
1064 : : /*
1065 : : * Scan the page to re-find the downlink. If the page was split, it might
1066 : : * have moved to a different page, so follow the right links until we find
1067 : : * it.
1068 : : */
1069 : : while (true)
812 1070 : 3 : {
1071 : : OffsetNumber i;
1072 : :
1073 : 10 : maxoff = PageGetMaxOffsetNumber(parent->page);
1074 [ + + ]: 45 : for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
1075 : : {
1076 : 42 : iid = PageGetItemId(parent->page, i);
1077 : 42 : idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
1078 [ + + ]: 42 : if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
1079 : : {
1080 : : /* yes!!, found */
1081 : 7 : child->downlinkoffnum = i;
1082 : 7 : return;
1083 : : }
1084 : : }
1085 : :
1086 : 3 : parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
1087 : 3 : parent->downlinkoffnum = InvalidOffsetNumber;
1088 : 3 : UnlockReleaseBuffer(parent->buffer);
1089 [ - + ]: 3 : if (parent->blkno == InvalidBlockNumber)
1090 : : {
1091 : : /*
1092 : : * End of chain and still didn't find parent. It's a very-very
1093 : : * rare situation when the root was split.
1094 : : */
812 heikki.linnakangas@i 1095 :UBC 0 : break;
1096 : : }
812 heikki.linnakangas@i 1097 :CBC 3 : parent->buffer = ReadBuffer(r, parent->blkno);
1098 : 3 : LockBuffer(parent->buffer, GIST_EXCLUSIVE);
1099 : 3 : gistcheckpage(r, parent->buffer);
109 peter@eisentraut.org 1100 :GNC 3 : parent->page = BufferGetPage(parent->buffer);
1101 : : }
1102 : :
1103 : : /*
1104 : : * awful!!, we need search tree to find parent ... , but before we should
1105 : : * release all old parent
1106 : : */
1107 : :
812 heikki.linnakangas@i 1108 :UBC 0 : ptr = child->parent->parent; /* child->parent already released above */
1109 [ # # ]: 0 : while (ptr)
1110 : : {
1111 : 0 : ReleaseBuffer(ptr->buffer);
1112 : 0 : ptr = ptr->parent;
1113 : : }
1114 : :
1115 : : /* ok, find new path */
1116 : 0 : ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
1117 : :
1118 : : /* read all buffers as expected by caller */
1119 : : /* note we don't lock them or gistcheckpage them here! */
1120 [ # # ]: 0 : while (ptr)
1121 : : {
1122 : 0 : ptr->buffer = ReadBuffer(r, ptr->blkno);
109 peter@eisentraut.org 1123 :UNC 0 : ptr->page = BufferGetPage(ptr->buffer);
812 heikki.linnakangas@i 1124 :UBC 0 : ptr = ptr->parent;
1125 : : }
1126 : :
1127 : : /* install new chain of parents to stack */
1128 : 0 : child->parent = parent;
1129 : :
1130 : : /* make recursive call to normal processing */
1131 : 0 : LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
1132 : 0 : gistFindCorrectParent(r, child, is_build);
1133 : : }
1134 : :
1135 : : /*
1136 : : * Form a downlink pointer for the page in 'buf'.
1137 : : */
1138 : : static IndexTuple
5472 1139 : 0 : gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
1140 : : GISTInsertStack *stack, bool is_build)
1141 : : {
3527 kgrittn@postgresql.o 1142 : 0 : Page page = BufferGetPage(buf);
1143 : : OffsetNumber maxoff;
1144 : : OffsetNumber offset;
5364 bruce@momjian.us 1145 : 0 : IndexTuple downlink = NULL;
1146 : :
5472 heikki.linnakangas@i 1147 : 0 : maxoff = PageGetMaxOffsetNumber(page);
1148 [ # # ]: 0 : for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
1149 : : {
1150 : : IndexTuple ituple = (IndexTuple)
942 tgl@sss.pgh.pa.us 1151 : 0 : PageGetItem(page, PageGetItemId(page, offset));
1152 : :
5472 heikki.linnakangas@i 1153 [ # # ]: 0 : if (downlink == NULL)
1154 : 0 : downlink = CopyIndexTuple(ituple);
1155 : : else
1156 : : {
1157 : : IndexTuple newdownlink;
1158 : :
1159 : 0 : newdownlink = gistgetadjusted(rel, downlink, ituple,
1160 : : giststate);
1161 [ # # ]: 0 : if (newdownlink)
1162 : 0 : downlink = newdownlink;
1163 : : }
1164 : : }
1165 : :
1166 : : /*
1167 : : * If the page is completely empty, we can't form a meaningful downlink
1168 : : * for it. But we have to insert a downlink for the page. Any key will do,
1169 : : * as long as its consistent with the downlink of parent page, so that we
1170 : : * can legally insert it to the parent. A minimal one that matches as few
1171 : : * scans as possible would be best, to keep scans from doing useless work,
1172 : : * but we don't know how to construct that. So we just use the downlink of
1173 : : * the original page that was split - that's as far from optimal as it can
1174 : : * get but will do..
1175 : : */
1176 [ # # ]: 0 : if (!downlink)
1177 : : {
1178 : : ItemId iid;
1179 : :
1180 : 0 : LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
812 1181 : 0 : gistFindCorrectParent(rel, stack, is_build);
5268 1182 : 0 : iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum);
5472 1183 : 0 : downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
1184 : 0 : downlink = CopyIndexTuple(downlink);
1185 : 0 : LockBuffer(stack->parent->buffer, GIST_UNLOCK);
1186 : : }
1187 : :
1188 : 0 : ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
1189 : 0 : GistTupleSetValid(downlink);
1190 : :
1191 : 0 : return downlink;
1192 : : }
1193 : :
1194 : :
1195 : : /*
1196 : : * Complete the incomplete split of state->stack->page.
1197 : : */
1198 : : static void
1199 : 0 : gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
1200 : : {
1201 : 0 : GISTInsertStack *stack = state->stack;
1202 : : Buffer buf;
1203 : : Page page;
1204 : 0 : List *splitinfo = NIL;
1205 : :
1838 peter@eisentraut.org 1206 [ # # ]: 0 : ereport(LOG,
1207 : : (errmsg("fixing incomplete split in index \"%s\", block %u",
1208 : : RelationGetRelationName(state->r), stack->blkno)));
1209 : :
5472 heikki.linnakangas@i 1210 [ # # ]: 0 : Assert(GistFollowRight(stack->page));
5268 1211 [ # # # # : 0 : Assert(OffsetNumberIsValid(stack->downlinkoffnum));
# # ]
1212 : :
5472 1213 : 0 : buf = stack->buffer;
1214 : :
1215 : : /*
1216 : : * Read the chain of split pages, following the rightlinks. Construct a
1217 : : * downlink tuple for each page.
1218 : : */
1219 : : for (;;)
1220 : 0 : {
6 michael@paquier.xyz 1221 :UNC 0 : GISTPageSplitInfo *si = palloc_object(GISTPageSplitInfo);
1222 : : IndexTuple downlink;
1223 : :
3527 kgrittn@postgresql.o 1224 :UBC 0 : page = BufferGetPage(buf);
1225 : :
1226 : : /* Form the new downlink tuples to insert to parent */
812 heikki.linnakangas@i 1227 : 0 : downlink = gistformdownlink(state->r, buf, giststate, stack, state->is_build);
1228 : :
5472 1229 : 0 : si->buf = buf;
1230 : 0 : si->downlink = downlink;
1231 : :
1232 : 0 : splitinfo = lappend(splitinfo, si);
1233 : :
1234 [ # # ]: 0 : if (GistFollowRight(page))
1235 : : {
1236 : : /* lock next page */
1237 : 0 : buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
1238 : 0 : LockBuffer(buf, GIST_EXCLUSIVE);
1239 : : }
1240 : : else
1241 : 0 : break;
1242 : : }
1243 : :
1244 : : /* Insert the downlinks */
4967 1245 : 0 : gistfinishsplit(state, stack, giststate, splitinfo, false);
5472 1246 : 0 : }
1247 : :
1248 : : /*
1249 : : * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the
1250 : : * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new.
1251 : : * 'stack' represents the path from the root to the page being updated.
1252 : : *
1253 : : * The caller must hold an exclusive lock on stack->buffer. The lock is still
1254 : : * held on return, but the page might not contain the inserted tuple if the
1255 : : * page was split. The function returns true if the page was split, false
1256 : : * otherwise.
1257 : : */
1258 : : static bool
4967 heikki.linnakangas@i 1259 :CBC 751559 : gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
1260 : : GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
1261 : : {
1262 : 751559 : return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum,
1263 : : InvalidBuffer, InvalidBuffer, false, false);
1264 : : }
1265 : :
1266 : : /* ----------------
1267 : : * An extended workhorse version of gistinserttuple(). This version allows
1268 : : * inserting multiple tuples, or replacing a single tuple with multiple tuples.
1269 : : * This is used to recursively update the downlinks in the parent when a page
1270 : : * is split.
1271 : : *
1272 : : * If leftchild and rightchild are valid, we're inserting/replacing the
1273 : : * downlink for rightchild, and leftchild is its left sibling. We clear the
1274 : : * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the
1275 : : * insertion of the downlink.
1276 : : *
1277 : : * To avoid holding locks for longer than necessary, when recursing up the
1278 : : * tree to update the parents, the locking is a bit peculiar here. On entry,
1279 : : * the caller must hold an exclusive lock on stack->buffer, as well as
1280 : : * leftchild and rightchild if given. On return:
1281 : : *
1282 : : * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is
1283 : : * always kept pinned, however.
1284 : : * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page
1285 : : * is kept pinned.
1286 : : * - Lock and pin on 'rightchild' are always released.
1287 : : *
1288 : : * Returns 'true' if the page had to be split. Note that if the page was
1289 : : * split, the inserted/updated tuples might've been inserted to a right
1290 : : * sibling of stack->buffer instead of stack->buffer itself.
1291 : : */
1292 : : static bool
5472 1293 : 763565 : gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
1294 : : GISTSTATE *giststate,
1295 : : IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
1296 : : Buffer leftchild, Buffer rightchild,
1297 : : bool unlockbuf, bool unlockleftchild)
1298 : : {
1299 : : List *splitinfo;
1300 : : bool is_split;
1301 : :
1302 : : /*
1303 : : * Check for any rw conflicts (in serializable isolation level) just
1304 : : * before we intend to modify the page
1305 : : */
2149 tmunro@postgresql.or 1306 : 763565 : CheckForSerializableConflictIn(state->r, NULL, BufferGetBlockNumber(stack->buffer));
1307 : :
1308 : : /* Insert the tuple(s) to the page, splitting the page if necessary */
5213 heikki.linnakangas@i 1309 : 763559 : is_split = gistplacetopage(state->r, state->freespace, giststate,
1310 : : stack->buffer,
1311 : : tuples, ntup,
1312 : : oldoffnum, NULL,
1313 : : leftchild,
1314 : : &splitinfo,
1315 : : true,
1316 : : state->heapRel,
2449 1317 : 763559 : state->is_build);
1318 : :
1319 : : /*
1320 : : * Before recursing up in case the page was split, release locks on the
1321 : : * child pages. We don't need to keep them locked when updating the
1322 : : * parent.
1323 : : */
4967 1324 [ + + ]: 763559 : if (BufferIsValid(rightchild))
1325 : 12006 : UnlockReleaseBuffer(rightchild);
1326 [ + + + + ]: 763559 : if (BufferIsValid(leftchild) && unlockleftchild)
1327 : 2566 : LockBuffer(leftchild, GIST_UNLOCK);
1328 : :
1329 : : /*
1330 : : * If we had to split, insert/update the downlinks in the parent. If the
1331 : : * caller requested us to release the lock on stack->buffer, tell
1332 : : * gistfinishsplit() to do that as soon as it's safe to do so. If we
1333 : : * didn't have to split, release it ourselves.
1334 : : */
5472 1335 [ + + ]: 763559 : if (splitinfo)
4967 1336 : 11979 : gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf);
1337 [ + + ]: 751580 : else if (unlockbuf)
1338 : 9413 : LockBuffer(stack->buffer, GIST_UNLOCK);
1339 : :
5472 1340 : 763559 : return is_split;
1341 : : }
1342 : :
1343 : : /*
1344 : : * Finish an incomplete split by inserting/updating the downlinks in parent
1345 : : * page. 'splitinfo' contains all the child pages involved in the split,
1346 : : * from left-to-right.
1347 : : *
1348 : : * On entry, the caller must hold a lock on stack->buffer and all the child
1349 : : * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is
1350 : : * released on return. The child pages are always unlocked and unpinned.
1351 : : */
1352 : : static void
1353 : 11979 : gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
1354 : : GISTSTATE *giststate, List *splitinfo, bool unlockbuf)
1355 : : {
1356 : : GISTPageSplitInfo *right;
1357 : : GISTPageSplitInfo *left;
1358 : : IndexTuple tuples[2];
1359 : :
1360 : : /* A split always contains at least two halves */
1361 [ - + ]: 11979 : Assert(list_length(splitinfo) >= 2);
1362 : :
1363 : : /*
1364 : : * We need to insert downlinks for each new page, and update the downlink
1365 : : * for the original (leftmost) page in the split. Begin at the rightmost
1366 : : * page, inserting one downlink at a time until there's only two pages
1367 : : * left. Finally insert the downlink for the last new page and update the
1368 : : * downlink for the original page as one operation.
1369 : : */
1370 : 11979 : LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
1371 : :
1372 : : /*
1373 : : * Insert downlinks for the siblings from right to left, until there are
1374 : : * only two siblings left.
1375 : : */
2344 tgl@sss.pgh.pa.us 1376 [ + + ]: 12006 : for (int pos = list_length(splitinfo) - 1; pos > 1; pos--)
1377 : : {
1378 : 27 : right = (GISTPageSplitInfo *) list_nth(splitinfo, pos);
1379 : 27 : left = (GISTPageSplitInfo *) list_nth(splitinfo, pos - 1);
1380 : :
812 heikki.linnakangas@i 1381 : 27 : gistFindCorrectParent(state->r, stack, state->is_build);
5472 1382 [ + + ]: 27 : if (gistinserttuples(state, stack->parent, giststate,
1383 : : &right->downlink, 1,
1384 : : InvalidOffsetNumber,
1385 : : left->buf, right->buf, false, false))
1386 : : {
1387 : : /*
1388 : : * If the parent page was split, the existing downlink might have
1389 : : * moved.
1390 : : */
2195 1391 : 3 : stack->downlinkoffnum = InvalidOffsetNumber;
1392 : : }
1393 : : /* gistinserttuples() released the lock on right->buf. */
1394 : : }
1395 : :
2344 tgl@sss.pgh.pa.us 1396 : 11979 : right = (GISTPageSplitInfo *) lsecond(splitinfo);
1397 : 11979 : left = (GISTPageSplitInfo *) linitial(splitinfo);
1398 : :
1399 : : /*
1400 : : * Finally insert downlink for the remaining right page and update the
1401 : : * downlink for the original page to not contain the tuples that were
1402 : : * moved to the new pages.
1403 : : */
5472 heikki.linnakangas@i 1404 : 11979 : tuples[0] = left->downlink;
1405 : 11979 : tuples[1] = right->downlink;
812 1406 : 11979 : gistFindCorrectParent(state->r, stack, state->is_build);
1407 : 11979 : (void) gistinserttuples(state, stack->parent, giststate,
1408 : : tuples, 2,
1409 : 11979 : stack->downlinkoffnum,
1410 : : left->buf, right->buf,
1411 : : true, /* Unlock parent */
1412 : : unlockbuf /* Unlock stack->buffer if caller
1413 : : * wants that */
1414 : : );
1415 : :
1416 : : /*
1417 : : * The downlink might have moved when we updated it. Even if the page
1418 : : * wasn't split, because gistinserttuples() implements updating the old
1419 : : * tuple by removing and re-inserting it!
1420 : : */
1421 : 11979 : stack->downlinkoffnum = InvalidOffsetNumber;
1422 : :
5472 1423 [ - + ]: 11979 : Assert(left->buf == stack->buffer);
1424 : :
1425 : : /*
1426 : : * If we split the page because we had to adjust the downlink on an
1427 : : * internal page, while descending the tree for inserting a new tuple,
1428 : : * then this might no longer be the correct page for the new tuple. The
1429 : : * downlink to this page might not cover the new tuple anymore, it might
1430 : : * need to go to the newly-created right sibling instead. Tell the caller
1431 : : * to walk back up the stack, to re-check at the parent which page to
1432 : : * insert to.
1433 : : *
1434 : : * Normally, the LSN-NSN interlock during the tree descend would also
1435 : : * detect that a concurrent split happened (by ourselves), and cause us to
1436 : : * retry at the parent. But that mechanism doesn't work during index
1437 : : * build, because we don't do WAL-logging, and don't update LSNs, during
1438 : : * index build.
1439 : : */
2408 1440 : 11979 : stack->retry_from_parent = true;
7484 teodor@sigaev.ru 1441 : 11979 : }
1442 : :
1443 : : /*
1444 : : * gistSplit -- split a page in the tree and fill struct
1445 : : * used for XLOG and real writes buffers. Function is recursive, ie
1446 : : * it will split page until keys will fit in every page.
1447 : : */
1448 : : SplitPageLayout *
10704 scrappy@hub.org 1449 : 13291 : gistSplit(Relation r,
1450 : : Page page,
1451 : : IndexTuple *itup, /* contains compressed entry */
1452 : : int len,
1453 : : GISTSTATE *giststate)
1454 : : {
1455 : : IndexTuple *lvectup,
1456 : : *rvectup;
1457 : : GistSplitVector v;
1458 : : int i;
714 rhaas@postgresql.org 1459 : 13291 : SplitPageLayout *res = NULL;
1460 : :
1461 : : /* this should never recurse very deeply, but better safe than sorry */
4092 heikki.linnakangas@i 1462 : 13291 : check_stack_depth();
1463 : :
1464 : : /* there's no point in splitting an empty page */
1465 [ - + ]: 13291 : Assert(len > 0);
1466 : :
1467 : : /*
1468 : : * If a single tuple doesn't fit on a page, no amount of splitting will
1469 : : * help.
1470 : : */
1471 [ - + ]: 13291 : if (len == 1)
4092 heikki.linnakangas@i 1472 [ # # ]:UBC 0 : ereport(ERROR,
1473 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1474 : : errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
1475 : : IndexTupleSize(itup[0]), GiSTPageSize,
1476 : : RelationGetRelationName(r))));
1477 : :
2473 akorotkov@postgresql 1478 :CBC 13291 : memset(v.spl_lisnull, true,
1479 : 13291 : sizeof(bool) * giststate->nonLeafTupdesc->natts);
1480 : 13291 : memset(v.spl_risnull, true,
1481 : 13291 : sizeof(bool) * giststate->nonLeafTupdesc->natts);
4692 tgl@sss.pgh.pa.us 1482 : 13291 : gistSplitByKey(r, page, itup, len, giststate, &v, 0);
1483 : :
1484 : : /* form left and right vector */
6 michael@paquier.xyz 1485 :GNC 13291 : lvectup = palloc_array(IndexTuple, len + 1);
1486 : 13291 : rvectup = palloc_array(IndexTuple, len + 1);
1487 : :
7111 teodor@sigaev.ru 1488 [ + + ]:CBC 571601 : for (i = 0; i < v.splitVector.spl_nleft; i++)
1489 : 558310 : lvectup[i] = itup[v.splitVector.spl_left[i] - 1];
1490 : :
1491 [ + + ]: 637569 : for (i = 0; i < v.splitVector.spl_nright; i++)
1492 : 624278 : rvectup[i] = itup[v.splitVector.spl_right[i] - 1];
1493 : :
1494 : : /* finalize splitting (may need another split) */
1495 [ + + ]: 13291 : if (!gistfitpage(rvectup, v.splitVector.spl_nright))
1496 : : {
1497 : 348 : res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate);
1498 : : }
1499 : : else
1500 : : {
7160 1501 : 12943 : ROTATEDIST(res);
7111 1502 : 12943 : res->block.num = v.splitVector.spl_nright;
7013 bruce@momjian.us 1503 : 12943 : res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
5472 heikki.linnakangas@i 1504 : 12943 : res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
1505 : : }
1506 : :
7111 teodor@sigaev.ru 1507 [ + + ]: 13291 : if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
1508 : : {
1509 : : SplitPageLayout *resptr,
1510 : : *subres;
1511 : :
1512 : 222 : resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate);
1513 : :
1514 : : /* install on list's tail */
7013 bruce@momjian.us 1515 [ + + ]: 565 : while (resptr->next)
7160 teodor@sigaev.ru 1516 : 343 : resptr = resptr->next;
1517 : :
1518 : 222 : resptr->next = res;
1519 : 222 : res = subres;
1520 : : }
1521 : : else
1522 : : {
1523 : 13069 : ROTATEDIST(res);
7111 1524 : 13069 : res->block.num = v.splitVector.spl_nleft;
7013 bruce@momjian.us 1525 : 13069 : res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
5472 heikki.linnakangas@i 1526 : 13069 : res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
1527 : : }
1528 : :
7160 teodor@sigaev.ru 1529 : 13291 : return res;
1530 : : }
1531 : :
1532 : : /*
1533 : : * Create a GISTSTATE and fill it with information about the index
1534 : : */
1535 : : GISTSTATE *
5191 tgl@sss.pgh.pa.us 1536 : 4686 : initGISTstate(Relation index)
1537 : : {
1538 : : GISTSTATE *giststate;
1539 : : MemoryContext scanCxt;
1540 : : MemoryContext oldCxt;
1541 : : int i;
1542 : :
1543 : : /* safety check to protect fixed-size arrays in GISTSTATE */
8883 1544 [ - + ]: 4686 : if (index->rd_att->natts > INDEX_MAX_KEYS)
8184 tgl@sss.pgh.pa.us 1545 [ # # ]:UBC 0 : elog(ERROR, "numberOfAttributes %d > %d",
1546 : : index->rd_att->natts, INDEX_MAX_KEYS);
1547 : :
1548 : : /* Create the memory context that will hold the GISTSTATE */
5191 tgl@sss.pgh.pa.us 1549 :CBC 4686 : scanCxt = AllocSetContextCreate(CurrentMemoryContext,
1550 : : "GiST scan context",
1551 : : ALLOCSET_DEFAULT_SIZES);
1552 : 4686 : oldCxt = MemoryContextSwitchTo(scanCxt);
1553 : :
1554 : : /* Create and fill in the GISTSTATE */
6 michael@paquier.xyz 1555 :GNC 4686 : giststate = palloc_object(GISTSTATE);
1556 : :
5191 tgl@sss.pgh.pa.us 1557 :CBC 4686 : giststate->scanCxt = scanCxt;
3100 1558 : 4686 : giststate->tempCxt = scanCxt; /* caller must change this if needed */
2473 akorotkov@postgresql 1559 : 4686 : giststate->leafTupdesc = index->rd_att;
1560 : :
1561 : : /*
1562 : : * The truncated tupdesc for non-leaf index tuples, which doesn't contain
1563 : : * the INCLUDE attributes.
1564 : : *
1565 : : * It is used to form tuples during tuple adjustment and page split.
1566 : : * B-tree creates shortened tuple descriptor for every truncated tuple,
1567 : : * because it is doing this less often: it does not have to form truncated
1568 : : * tuples during page split. Also, B-tree is not adjusting tuples on
1569 : : * internal pages the way GiST does.
1570 : : */
361 drowley@postgresql.o 1571 : 9372 : giststate->nonLeafTupdesc = CreateTupleDescTruncatedCopy(index->rd_att,
1572 : 4686 : IndexRelationGetNumberOfKeyAttributes(index));
1573 : :
2473 akorotkov@postgresql 1574 [ + + ]: 12427 : for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(index); i++)
1575 : : {
8837 tgl@sss.pgh.pa.us 1576 : 7741 : fmgr_info_copy(&(giststate->consistentFn[i]),
7518 neilc@samurai.com 1577 : 7741 : index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC),
1578 : : scanCxt);
8837 tgl@sss.pgh.pa.us 1579 : 7741 : fmgr_info_copy(&(giststate->unionFn[i]),
8818 bruce@momjian.us 1580 : 7741 : index_getprocinfo(index, i + 1, GIST_UNION_PROC),
1581 : : scanCxt);
1582 : :
1583 : : /* opclasses are not required to provide a Compress method */
3010 tgl@sss.pgh.pa.us 1584 [ + + ]: 7741 : if (OidIsValid(index_getprocid(index, i + 1, GIST_COMPRESS_PROC)))
1585 : 2695 : fmgr_info_copy(&(giststate->compressFn[i]),
1586 : 2695 : index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC),
1587 : : scanCxt);
1588 : : else
1589 : 5046 : giststate->compressFn[i].fn_oid = InvalidOid;
1590 : :
1591 : : /* opclasses are not required to provide a Decompress method */
1592 [ + + ]: 7741 : if (OidIsValid(index_getprocid(index, i + 1, GIST_DECOMPRESS_PROC)))
1593 : 811 : fmgr_info_copy(&(giststate->decompressFn[i]),
1594 : 811 : index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC),
1595 : : scanCxt);
1596 : : else
1597 : 6930 : giststate->decompressFn[i].fn_oid = InvalidOid;
1598 : :
8837 1599 : 7741 : fmgr_info_copy(&(giststate->penaltyFn[i]),
8818 bruce@momjian.us 1600 : 7741 : index_getprocinfo(index, i + 1, GIST_PENALTY_PROC),
1601 : : scanCxt);
8837 tgl@sss.pgh.pa.us 1602 : 7741 : fmgr_info_copy(&(giststate->picksplitFn[i]),
7518 neilc@samurai.com 1603 : 7741 : index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC),
1604 : : scanCxt);
8837 tgl@sss.pgh.pa.us 1605 : 7741 : fmgr_info_copy(&(giststate->equalFn[i]),
8818 bruce@momjian.us 1606 : 7741 : index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
1607 : : scanCxt);
1608 : :
1609 : : /* opclasses are not required to provide a Distance method */
5492 tgl@sss.pgh.pa.us 1610 [ + + ]: 7741 : if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
1611 : 957 : fmgr_info_copy(&(giststate->distanceFn[i]),
3100 1612 : 957 : index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
1613 : : scanCxt);
1614 : : else
5492 1615 : 6784 : giststate->distanceFn[i].fn_oid = InvalidOid;
1616 : :
1617 : : /* opclasses are not required to provide a Fetch method */
3918 heikki.linnakangas@i 1618 [ + + ]: 7741 : if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC)))
1619 : 689 : fmgr_info_copy(&(giststate->fetchFn[i]),
3860 bruce@momjian.us 1620 : 689 : index_getprocinfo(index, i + 1, GIST_FETCH_PROC),
1621 : : scanCxt);
1622 : : else
3918 heikki.linnakangas@i 1623 : 7052 : giststate->fetchFn[i].fn_oid = InvalidOid;
1624 : :
1625 : : /*
1626 : : * If the index column has a specified collation, we should honor that
1627 : : * while doing comparisons. However, we may have a collatable storage
1628 : : * type for a noncollatable indexed data type. If there's no index
1629 : : * collation then specify default collation in case the support
1630 : : * functions need collation. This is harmless if the support
1631 : : * functions don't care about collation, so we just do it
1632 : : * unconditionally. (We could alternatively call get_typcollation,
1633 : : * but that seems like expensive overkill --- there aren't going to be
1634 : : * any cases where a GiST storage type has a nondefault collation.)
1635 : : */
5352 tgl@sss.pgh.pa.us 1636 [ + + ]: 7741 : if (OidIsValid(index->rd_indcollation[i]))
1637 : 97 : giststate->supportCollation[i] = index->rd_indcollation[i];
1638 : : else
1639 : 7644 : giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
1640 : : }
1641 : :
1642 : : /* No opclass information for INCLUDE attributes */
2473 akorotkov@postgresql 1643 [ + + ]: 4937 : for (; i < index->rd_att->natts; i++)
1644 : : {
1645 : 251 : giststate->consistentFn[i].fn_oid = InvalidOid;
1646 : 251 : giststate->unionFn[i].fn_oid = InvalidOid;
1647 : 251 : giststate->compressFn[i].fn_oid = InvalidOid;
1648 : 251 : giststate->decompressFn[i].fn_oid = InvalidOid;
1649 : 251 : giststate->penaltyFn[i].fn_oid = InvalidOid;
1650 : 251 : giststate->picksplitFn[i].fn_oid = InvalidOid;
1651 : 251 : giststate->equalFn[i].fn_oid = InvalidOid;
1652 : 251 : giststate->distanceFn[i].fn_oid = InvalidOid;
1653 : 251 : giststate->fetchFn[i].fn_oid = InvalidOid;
1654 : 251 : giststate->supportCollation[i] = InvalidOid;
1655 : : }
1656 : :
5191 tgl@sss.pgh.pa.us 1657 : 4686 : MemoryContextSwitchTo(oldCxt);
1658 : :
1659 : 4686 : return giststate;
1660 : : }
1661 : :
1662 : : void
8818 bruce@momjian.us 1663 : 3582 : freeGISTstate(GISTSTATE *giststate)
1664 : : {
1665 : : /* It's sufficient to delete the scanCxt */
5191 tgl@sss.pgh.pa.us 1666 : 3582 : MemoryContextDelete(giststate->scanCxt);
10704 scrappy@hub.org 1667 : 3582 : }
1668 : :
1669 : : /*
1670 : : * gistprunepage() -- try to remove LP_DEAD items from the given page.
1671 : : * Function assumes that buffer is exclusively locked.
1672 : : */
1673 : : static void
2478 heikki.linnakangas@i 1674 :UBC 0 : gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
1675 : : {
1676 : : OffsetNumber deletable[MaxIndexTuplesPerPage];
3477 rhaas@postgresql.org 1677 : 0 : int ndeletable = 0;
1678 : : OffsetNumber offnum,
1679 : : maxoff;
1680 : :
3751 teodor@sigaev.ru 1681 [ # # ]: 0 : Assert(GistPageIsLeaf(page));
1682 : :
1683 : : /*
1684 : : * Scan over all items to see which ones need to be deleted according to
1685 : : * LP_DEAD flags.
1686 : : */
1687 : 0 : maxoff = PageGetMaxOffsetNumber(page);
1688 : 0 : for (offnum = FirstOffsetNumber;
1689 [ # # ]: 0 : offnum <= maxoff;
1690 : 0 : offnum = OffsetNumberNext(offnum))
1691 : : {
1692 : 0 : ItemId itemId = PageGetItemId(page, offnum);
1693 : :
1694 [ # # ]: 0 : if (ItemIdIsDead(itemId))
1695 : 0 : deletable[ndeletable++] = offnum;
1696 : : }
1697 : :
1698 [ # # ]: 0 : if (ndeletable > 0)
1699 : : {
1125 pg@bowt.ie 1700 : 0 : TransactionId snapshotConflictHorizon = InvalidTransactionId;
1701 : :
1785 1702 [ # # # # : 0 : if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
# # # # #
# ]
1703 : : snapshotConflictHorizon =
1704 : 0 : index_compute_xid_horizon_for_tuples(rel, heapRel, buffer,
1705 : : deletable, ndeletable);
1706 : :
3751 teodor@sigaev.ru 1707 : 0 : START_CRIT_SECTION();
1708 : :
1709 : 0 : PageIndexMultiDelete(page, deletable, ndeletable);
1710 : :
1711 : : /*
1712 : : * Mark the page as not containing any LP_DEAD items. This is not
1713 : : * certainly true (there might be some that have recently been marked,
1714 : : * but weren't included in our target-item list), but it will almost
1715 : : * always be true and it doesn't seem worth an additional page scan to
1716 : : * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
1717 : : */
1718 : 0 : GistClearPageHasGarbage(page);
1719 : :
1720 : 0 : MarkBufferDirty(buffer);
1721 : :
1722 : : /* XLOG stuff */
1723 [ # # # # : 0 : if (RelationNeedsWAL(rel))
# # # # ]
1724 : 0 : {
1725 : : XLogRecPtr recptr;
1726 : :
2552 akorotkov@postgresql 1727 : 0 : recptr = gistXLogDelete(buffer,
1728 : : deletable, ndeletable,
1729 : : snapshotConflictHorizon,
1730 : : heapRel);
1731 : :
3751 teodor@sigaev.ru 1732 : 0 : PageSetLSN(page, recptr);
1733 : : }
1734 : : else
1735 : 0 : PageSetLSN(page, gistGetFakeLSN(rel));
1736 : :
1737 [ # # ]: 0 : END_CRIT_SECTION();
1738 : : }
1739 : :
1740 : : /*
1741 : : * Note: if we didn't find any LP_DEAD items, then the page's
1742 : : * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
1743 : : * separate write to clear it, however. We will clear it when we split
1744 : : * the page.
1745 : : */
1746 : 0 : }
|