Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * reorderbuffer.c
4 : : * PostgreSQL logical replay/reorder buffer management
5 : : *
6 : : *
7 : : * Copyright (c) 2012-2025, PostgreSQL Global Development Group
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/replication/logical/reorderbuffer.c
12 : : *
13 : : * NOTES
14 : : * This module gets handed individual pieces of transactions in the order
15 : : * they are written to the WAL and is responsible to reassemble them into
16 : : * toplevel transaction sized pieces. When a transaction is completely
17 : : * reassembled - signaled by reading the transaction commit record - it
18 : : * will then call the output plugin (cf. ReorderBufferCommit()) with the
19 : : * individual changes. The output plugins rely on snapshots built by
20 : : * snapbuild.c which hands them to us.
21 : : *
22 : : * Transactions and subtransactions/savepoints in postgres are not
23 : : * immediately linked to each other from outside the performing
24 : : * backend. Only at commit/abort (or special xact_assignment records) they
25 : : * are linked together. Which means that we will have to splice together a
26 : : * toplevel transaction from its subtransactions. To do that efficiently we
27 : : * build a binary heap indexed by the smallest current lsn of the individual
28 : : * subtransactions' changestreams. As the individual streams are inherently
29 : : * ordered by LSN - since that is where we build them from - the transaction
30 : : * can easily be reassembled by always using the subtransaction with the
31 : : * smallest current LSN from the heap.
32 : : *
33 : : * In order to cope with large transactions - which can be several times as
34 : : * big as the available memory - this module supports spooling the contents
35 : : * of large transactions to disk. When the transaction is replayed the
36 : : * contents of individual (sub-)transactions will be read from disk in
37 : : * chunks.
38 : : *
39 : : * This module also has to deal with reassembling toast records from the
40 : : * individual chunks stored in WAL. When a new (or initial) version of a
41 : : * tuple is stored in WAL it will always be preceded by the toast chunks
42 : : * emitted for the columns stored out of line. Within a single toplevel
43 : : * transaction there will be no other data carrying records between a row's
44 : : * toast chunks and the row data itself. See ReorderBufferToast* for
45 : : * details.
46 : : *
47 : : * ReorderBuffer uses two special memory context types - SlabContext for
48 : : * allocations of fixed-length structures (changes and transactions), and
49 : : * GenerationContext for the variable-length transaction data (allocated
50 : : * and freed in groups with similar lifespans).
51 : : *
52 : : * To limit the amount of memory used by decoded changes, we track memory
53 : : * used at the reorder buffer level (i.e. total amount of memory), and for
54 : : * each transaction. When the total amount of used memory exceeds the
55 : : * limit, the transaction consuming the most memory is then serialized to
56 : : * disk.
57 : : *
58 : : * Only decoded changes are evicted from memory (spilled to disk), not the
59 : : * transaction records. The number of toplevel transactions is limited,
60 : : * but a transaction with many subtransactions may still consume significant
61 : : * amounts of memory. However, the transaction records are fairly small and
62 : : * are not included in the memory limit.
63 : : *
64 : : * The current eviction algorithm is very simple - the transaction is
65 : : * picked merely by size, while it might be useful to also consider age
66 : : * (LSN) of the changes for example. With the new Generational memory
67 : : * allocator, evicting the oldest changes would make it more likely the
68 : : * memory gets actually freed.
69 : : *
70 : : * We use a max-heap with transaction size as the key to efficiently find
71 : : * the largest transaction. We update the max-heap whenever the memory
72 : : * counter is updated; however transactions with size 0 are not stored in
73 : : * the heap, because they have no changes to evict.
74 : : *
75 : : * We still rely on max_changes_in_memory when loading serialized changes
76 : : * back into memory. At that point we can't use the memory limit directly
77 : : * as we load the subxacts independently. One option to deal with this
78 : : * would be to count the subxacts, and allow each to allocate 1/N of the
79 : : * memory limit. That however does not seem very appealing, because with
80 : : * many subtransactions it may easily cause thrashing (short cycles of
81 : : * deserializing and applying very few changes). We probably should give
82 : : * a bit more memory to the oldest subtransactions, because it's likely
83 : : * they are the source for the next sequence of changes.
84 : : *
85 : : * -------------------------------------------------------------------------
86 : : */
87 : : #include "postgres.h"
88 : :
89 : : #include <unistd.h>
90 : : #include <sys/stat.h>
91 : :
92 : : #include "access/detoast.h"
93 : : #include "access/heapam.h"
94 : : #include "access/rewriteheap.h"
95 : : #include "access/transam.h"
96 : : #include "access/xact.h"
97 : : #include "access/xlog_internal.h"
98 : : #include "catalog/catalog.h"
99 : : #include "common/int.h"
100 : : #include "lib/binaryheap.h"
101 : : #include "miscadmin.h"
102 : : #include "pgstat.h"
103 : : #include "replication/logical.h"
104 : : #include "replication/reorderbuffer.h"
105 : : #include "replication/slot.h"
106 : : #include "replication/snapbuild.h" /* just for SnapBuildSnapDecRefcount */
107 : : #include "storage/bufmgr.h"
108 : : #include "storage/fd.h"
109 : : #include "storage/procarray.h"
110 : : #include "storage/sinval.h"
111 : : #include "utils/builtins.h"
112 : : #include "utils/inval.h"
113 : : #include "utils/memutils.h"
114 : : #include "utils/rel.h"
115 : : #include "utils/relfilenumbermap.h"
116 : :
117 : : /*
118 : : * Each transaction has an 8MB limit for invalidation messages distributed from
119 : : * other transactions. This limit is set considering scenarios with many
120 : : * concurrent logical decoding operations. When the distributed invalidation
121 : : * messages reach this threshold, the transaction is marked as
122 : : * RBTXN_DISTR_INVAL_OVERFLOWED to invalidate the complete cache as we have lost
123 : : * some inval messages and hence don't know what needs to be invalidated.
124 : : */
125 : : #define MAX_DISTR_INVAL_MSG_PER_TXN \
126 : : ((8 * 1024 * 1024) / sizeof(SharedInvalidationMessage))
127 : :
128 : : /* entry for a hash table we use to map from xid to our transaction state */
/*
 * One entry of the xid => transaction hash table (the table is created with
 * keysize sizeof(TransactionId) and this struct as its entry type).
 */
typedef struct ReorderBufferTXNByIdEnt
{
	TransactionId xid;			/* hash key; must stay the first member */
	ReorderBufferTXN *txn;		/* transaction state registered for xid */
} ReorderBufferTXNByIdEnt;
134 : :
135 : : /* data structures for (relfilelocator, ctid) => (cmin, cmax) mapping */
/*
 * Lookup key of the (relfilelocator, ctid) => (cmin, cmax) mapping: it
 * identifies one tuple version by the relation's file locator plus the
 * tuple's item pointer.
 */
typedef struct ReorderBufferTupleCidKey
{
	RelFileLocator rlocator;	/* relation file the tuple lives in */
	ItemPointerData tid;		/* tuple's ctid within that relation */
} ReorderBufferTupleCidKey;
141 : :
/*
 * Value side of the (relfilelocator, ctid) => (cmin, cmax) mapping; the key
 * is embedded as the first member.
 */
typedef struct ReorderBufferTupleCidEnt
{
	ReorderBufferTupleCidKey key;	/* identifies the tuple version */
	CommandId	cmin;			/* cmin recorded for the tuple */
	CommandId	cmax;			/* cmax recorded for the tuple */
	CommandId	combocid;		/* just for debugging */
} ReorderBufferTupleCidEnt;
149 : :
150 : : /* Virtual file descriptor with file offset tracking */
/*
 * Pairs a virtual file descriptor with the position at which the next read
 * or write should happen, so the offset is tracked alongside the descriptor.
 */
typedef struct TXNEntryFile
{
	File		vfd;			/* -1 when the file is closed */
	off_t		curOffset;		/* offset for next write or read. Reset to 0
								 * when vfd is opened. */
} TXNEntryFile;
157 : :
158 : : /* k-way in-order change iteration support structures */
/*
 * Per-(sub)transaction cursor used by the k-way in-order change iteration.
 *
 * NOTE(review): the field roles below are inferred from the file header's
 * description of the LSN-ordered heap merge; the iterator functions that
 * fill them in are outside this view, so confirm against
 * ReorderBufferIterTXNInit/Next.
 */
typedef struct ReorderBufferIterTXNEntry
{
	XLogRecPtr	lsn;			/* presumably the LSN of 'change', i.e. the
								 * merge ordering key -- TODO confirm */
	ReorderBufferChange *change;	/* presumably the current change of this
									 * stream -- TODO confirm */
	ReorderBufferTXN *txn;		/* transaction this cursor iterates over */
	TXNEntryFile file;			/* spill file handle plus read offset */
	XLogSegNo	segno;			/* presumably the segment currently being
								 * restored from disk -- TODO confirm */
} ReorderBufferIterTXNEntry;
167 : :
/*
 * Overall state of one k-way merge iteration: a binary heap plus one
 * ReorderBufferIterTXNEntry per participating (sub)transaction, stored
 * inline as a flexible array so state and entries form one allocation.
 */
typedef struct ReorderBufferIterTXNState
{
	binaryheap *heap;			/* heap used to pick the next entry */
	Size		nr_txns;		/* number of elements in entries[] */
	dlist_head	old_change;		/* NOTE(review): looks like a holding list
								 * for changes already handed out -- confirm
								 * against the iterator code */
	ReorderBufferIterTXNEntry entries[FLEXIBLE_ARRAY_MEMBER];
} ReorderBufferIterTXNState;
175 : :
176 : : /* toast datastructures */
/*
 * Reassembly state for a single toasted value, identified by its chunk_id.
 * Chunks seen so far are accumulated in 'chunks'.
 */
typedef struct ReorderBufferToastEnt
{
	Oid			chunk_id;		/* toast_table.chunk_id */
	int32		last_chunk_seq; /* toast_table.chunk_seq of the last chunk we
								 * have seen */
	Size		num_chunks;		/* number of chunks we've already seen */
	Size		size;			/* combined size of chunks seen */
	dlist_head	chunks;			/* linked list of chunks */
	struct varlena *reconstructed;	/* reconstructed varlena now pointed to in
									 * main tup */
} ReorderBufferToastEnt;
188 : :
189 : : /* Disk serialization support datastructures */
/*
 * Fixed-size header of a change as stored in a spill file; variable-length
 * payload is appended directly after the struct.
 */
typedef struct ReorderBufferDiskChange
{
	Size		size;			/* NOTE(review): presumably the total length
								 * of this record including the trailing data
								 * -- confirm in ReorderBufferSerializeChange */
	ReorderBufferChange change; /* in-memory change struct, copied verbatim */
	/* data follows */
} ReorderBufferDiskChange;
196 : :
/* True iff 'action' is the internal speculative-insert change type. */
#define IsSpecInsert(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
/*
 * True iff 'action' ends a speculative insertion, either by confirming or by
 * aborting it.  Note that 'action' may be evaluated twice.
 */
#define IsSpecConfirmOrAbort(action) \
	((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT)
/*
 * True iff 'action' is an insert, an update, or a speculative insert.  Note
 * that 'action' may be evaluated up to three times.
 */
#define IsInsertOrUpdate(action) \
	((action) == REORDER_BUFFER_CHANGE_INSERT || \
	 (action) == REORDER_BUFFER_CHANGE_UPDATE || \
	 (action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)
212 : :
213 : : /*
214 : : * Maximum number of changes kept in memory, per transaction. After that,
215 : : * changes are spooled to disk.
216 : : *
217 : : * The current value should be sufficient to decode the entire transaction
218 : : * without hitting disk in OLTP workloads, while starting to spool to disk in
219 : : * other workloads reasonably fast.
220 : : *
221 : : * At some point in the future it probably makes sense to have a more elaborate
222 : : * resource management here, but it's not entirely clear what that would look
223 : : * like.
224 : : */
/*
 * Memory threshold for decoded changes (see the file header's discussion of
 * the memory limit).  NOTE(review): presumably set through the GUC machinery
 * and expressed in kilobytes -- confirm against guc_tables.c.
 */
int			logical_decoding_work_mem;

/*
 * Per-transaction cap on the number of changes held in memory.  As the file
 * header explains, it is still relied upon when loading serialized changes
 * back into memory, because the memory limit cannot be applied directly
 * there.
 */
static const Size max_changes_in_memory = 4096; /* XXX for restore only */

/* GUC variable */
int			debug_logical_replication_streaming = DEBUG_LOGICAL_REP_STREAMING_BUFFERED;
230 : :
231 : : /* ---------------------------------------
232 : : * primary reorderbuffer support routines
233 : : * ---------------------------------------
234 : : */
235 : : static ReorderBufferTXN *ReorderBufferAllocTXN(ReorderBuffer *rb);
236 : : static void ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
237 : : static ReorderBufferTXN *ReorderBufferTXNByXid(ReorderBuffer *rb,
238 : : TransactionId xid, bool create, bool *is_new,
239 : : XLogRecPtr lsn, bool create_as_top);
240 : : static void ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
241 : : ReorderBufferTXN *subtxn);
242 : :
243 : : static void AssertTXNLsnOrder(ReorderBuffer *rb);
244 : :
245 : : /* ---------------------------------------
246 : : * support functions for lsn-order iterating over the ->changes of a
247 : : * transaction and its subtransactions
248 : : *
249 : : * used for iteration over the k-way heap merge of a transaction and its
250 : : * subtransactions
251 : : * ---------------------------------------
252 : : */
253 : : static void ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
254 : : ReorderBufferIterTXNState *volatile *iter_state);
255 : : static ReorderBufferChange *ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state);
256 : : static void ReorderBufferIterTXNFinish(ReorderBuffer *rb,
257 : : ReorderBufferIterTXNState *state);
258 : : static void ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs);
259 : :
260 : : /*
261 : : * ---------------------------------------
262 : : * Disk serialization support functions
263 : : * ---------------------------------------
264 : : */
265 : : static void ReorderBufferCheckMemoryLimit(ReorderBuffer *rb);
266 : : static void ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
267 : : static void ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
268 : : int fd, ReorderBufferChange *change);
269 : : static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
270 : : TXNEntryFile *file, XLogSegNo *segno);
271 : : static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
272 : : char *data);
273 : : static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn);
274 : : static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
275 : : bool txn_prepared);
276 : : static void ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn);
277 : : static bool ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
278 : : static void ReorderBufferCleanupSerializedTXNs(const char *slotname);
279 : : static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot,
280 : : TransactionId xid, XLogSegNo segno);
281 : : static int ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg);
282 : :
283 : : static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap);
284 : : static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
285 : : ReorderBufferTXN *txn, CommandId cid);
286 : :
287 : : /*
288 : : * ---------------------------------------
289 : : * Streaming support functions
290 : : * ---------------------------------------
291 : : */
292 : : static inline bool ReorderBufferCanStream(ReorderBuffer *rb);
293 : : static inline bool ReorderBufferCanStartStreaming(ReorderBuffer *rb);
294 : : static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn);
295 : : static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn);
296 : :
297 : : /* ---------------------------------------
298 : : * toast reassembly support
299 : : * ---------------------------------------
300 : : */
301 : : static void ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn);
302 : : static void ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn);
303 : : static void ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
304 : : Relation relation, ReorderBufferChange *change);
305 : : static void ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
306 : : Relation relation, ReorderBufferChange *change);
307 : :
308 : : /*
309 : : * ---------------------------------------
310 : : * memory accounting
311 : : * ---------------------------------------
312 : : */
313 : : static Size ReorderBufferChangeSize(ReorderBufferChange *change);
314 : : static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
315 : : ReorderBufferChange *change,
316 : : ReorderBufferTXN *txn,
317 : : bool addition, Size sz);
318 : :
319 : : /*
320 : : * Allocate a new ReorderBuffer and clean out any old serialized state from
321 : : * prior ReorderBuffer instances for the same slot.
322 : : */
323 : : ReorderBuffer *
4307 rhaas@postgresql.org 324 :CBC 1059 : ReorderBufferAllocate(void)
325 : : {
326 : : ReorderBuffer *buffer;
327 : : HASHCTL hash_ctl;
328 : : MemoryContext new_ctx;
329 : :
2843 alvherre@alvh.no-ip. 330 [ - + ]: 1059 : Assert(MyReplicationSlot != NULL);
331 : :
332 : : /* allocate memory in own context, to have better accountability */
4307 rhaas@postgresql.org 333 : 1059 : new_ctx = AllocSetContextCreate(CurrentMemoryContext,
334 : : "ReorderBuffer",
335 : : ALLOCSET_DEFAULT_SIZES);
336 : :
337 : : buffer =
338 : 1059 : (ReorderBuffer *) MemoryContextAlloc(new_ctx, sizeof(ReorderBuffer));
339 : :
340 : 1059 : memset(&hash_ctl, 0, sizeof(hash_ctl));
341 : :
342 : 1059 : buffer->context = new_ctx;
343 : :
3215 andres@anarazel.de 344 : 1059 : buffer->change_context = SlabContextCreate(new_ctx,
345 : : "Change",
346 : : SLAB_DEFAULT_BLOCK_SIZE,
347 : : sizeof(ReorderBufferChange));
348 : :
349 : 1059 : buffer->txn_context = SlabContextCreate(new_ctx,
350 : : "TXN",
351 : : SLAB_DEFAULT_BLOCK_SIZE,
352 : : sizeof(ReorderBufferTXN));
353 : :
354 : : /*
355 : : * To minimize memory fragmentation caused by long-running transactions
356 : : * with changes spanning multiple memory blocks, we use a single
357 : : * fixed-size memory block for decoded tuple storage. The performance
358 : : * testing showed that the default memory block size maintains logical
359 : : * decoding performance without causing fragmentation due to concurrent
360 : : * transactions. One might think that we can use the max size as
361 : : * SLAB_LARGE_BLOCK_SIZE but the test also showed it doesn't help resolve
362 : : * the memory fragmentation.
363 : : */
2946 simon@2ndQuadrant.co 364 : 1059 : buffer->tup_context = GenerationContextCreate(new_ctx,
365 : : "Tuples",
366 : : SLAB_DEFAULT_BLOCK_SIZE,
367 : : SLAB_DEFAULT_BLOCK_SIZE,
368 : : SLAB_DEFAULT_BLOCK_SIZE);
369 : :
4307 rhaas@postgresql.org 370 : 1059 : hash_ctl.keysize = sizeof(TransactionId);
371 : 1059 : hash_ctl.entrysize = sizeof(ReorderBufferTXNByIdEnt);
372 : 1059 : hash_ctl.hcxt = buffer->context;
373 : :
374 : 1059 : buffer->by_txn = hash_create("ReorderBufferByXid", 1000, &hash_ctl,
375 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
376 : :
377 : 1059 : buffer->by_txn_last_xid = InvalidTransactionId;
378 : 1059 : buffer->by_txn_last_txn = NULL;
379 : :
380 : 1059 : buffer->outbuf = NULL;
381 : 1059 : buffer->outbufsize = 0;
2223 akapila@postgresql.o 382 : 1059 : buffer->size = 0;
383 : :
384 : : /* txn_heap is ordered by transaction size */
615 msawada@postgresql.o 385 : 1059 : buffer->txn_heap = pairingheap_allocate(ReorderBufferTXNSizeCompare, NULL);
386 : :
1896 akapila@postgresql.o 387 : 1059 : buffer->spillTxns = 0;
388 : 1059 : buffer->spillCount = 0;
389 : 1059 : buffer->spillBytes = 0;
1875 390 : 1059 : buffer->streamTxns = 0;
391 : 1059 : buffer->streamCount = 0;
392 : 1059 : buffer->streamBytes = 0;
70 msawada@postgresql.o 393 :GNC 1059 : buffer->memExceededCount = 0;
1706 akapila@postgresql.o 394 :CBC 1059 : buffer->totalTxns = 0;
395 : 1059 : buffer->totalBytes = 0;
396 : :
4307 rhaas@postgresql.org 397 : 1059 : buffer->current_restart_decoding_lsn = InvalidXLogRecPtr;
398 : :
399 : 1059 : dlist_init(&buffer->toplevel_by_lsn);
2731 alvherre@alvh.no-ip. 400 : 1059 : dlist_init(&buffer->txns_by_base_snapshot_lsn);
1141 drowley@postgresql.o 401 : 1059 : dclist_init(&buffer->catchange_txns);
402 : :
403 : : /*
404 : : * Ensure there's no stale data from prior uses of this slot, in case some
405 : : * prior exit avoided calling ReorderBufferFree. Failure to do this can
406 : : * produce duplicated txns, and it's very cheap if there's nothing there.
407 : : */
2843 alvherre@alvh.no-ip. 408 : 1059 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
409 : :
4307 rhaas@postgresql.org 410 : 1059 : return buffer;
411 : : }
412 : :
413 : : /*
414 : : * Free a ReorderBuffer
415 : : */
416 : : void
417 : 846 : ReorderBufferFree(ReorderBuffer *rb)
418 : : {
419 : 846 : MemoryContext context = rb->context;
420 : :
421 : : /*
422 : : * We free separately allocated data by entirely scrapping reorderbuffer's
423 : : * memory context.
424 : : */
425 : 846 : MemoryContextDelete(context);
426 : :
427 : : /* Free disk space used by unconsumed reorder buffers */
2843 alvherre@alvh.no-ip. 428 : 846 : ReorderBufferCleanupSerializedTXNs(NameStr(MyReplicationSlot->data.name));
4307 rhaas@postgresql.org 429 : 846 : }
430 : :
431 : : /*
432 : : * Allocate a new ReorderBufferTXN.
433 : : */
434 : : static ReorderBufferTXN *
280 heikki.linnakangas@i 435 : 3920 : ReorderBufferAllocTXN(ReorderBuffer *rb)
436 : : {
437 : : ReorderBufferTXN *txn;
438 : :
439 : : txn = (ReorderBufferTXN *)
3215 andres@anarazel.de 440 : 3920 : MemoryContextAlloc(rb->txn_context, sizeof(ReorderBufferTXN));
441 : :
4307 rhaas@postgresql.org 442 : 3920 : memset(txn, 0, sizeof(ReorderBufferTXN));
443 : :
444 : 3920 : dlist_init(&txn->changes);
445 : 3920 : dlist_init(&txn->tuplecids);
446 : 3920 : dlist_init(&txn->subtxns);
447 : :
448 : : /* InvalidCommandId is not zero, so set it explicitly */
1957 akapila@postgresql.o 449 : 3920 : txn->command_id = InvalidCommandId;
1856 450 : 3920 : txn->output_plugin_private = NULL;
451 : :
4307 rhaas@postgresql.org 452 : 3920 : return txn;
453 : : }
454 : :
455 : : /*
456 : : * Free a ReorderBufferTXN.
457 : : */
458 : : static void
280 heikki.linnakangas@i 459 : 3862 : ReorderBufferFreeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
460 : : {
461 : : /* clean the lookup cache if we were cached (quite likely) */
4307 rhaas@postgresql.org 462 [ + + ]: 3862 : if (rb->by_txn_last_xid == txn->xid)
463 : : {
464 : 3677 : rb->by_txn_last_xid = InvalidTransactionId;
465 : 3677 : rb->by_txn_last_txn = NULL;
466 : : }
467 : :
468 : : /* free data that's contained */
469 : :
1808 akapila@postgresql.o 470 [ + + ]: 3862 : if (txn->gid != NULL)
471 : : {
472 : 41 : pfree(txn->gid);
473 : 41 : txn->gid = NULL;
474 : : }
475 : :
4307 rhaas@postgresql.org 476 [ + + ]: 3862 : if (txn->tuplecid_hash != NULL)
477 : : {
478 : 654 : hash_destroy(txn->tuplecid_hash);
479 : 654 : txn->tuplecid_hash = NULL;
480 : : }
481 : :
482 [ + + ]: 3862 : if (txn->invalidations)
483 : : {
484 : 1229 : pfree(txn->invalidations);
485 : 1229 : txn->invalidations = NULL;
486 : : }
487 : :
184 msawada@postgresql.o 488 [ + + ]: 3862 : if (txn->invalidations_distributed)
489 : : {
490 : 21 : pfree(txn->invalidations_distributed);
491 : 21 : txn->invalidations_distributed = NULL;
492 : : }
493 : :
494 : : /* Reset the toast hash */
1646 akapila@postgresql.o 495 : 3862 : ReorderBufferToastReset(rb, txn);
496 : :
497 : : /* All changes must be deallocated */
478 msawada@postgresql.o 498 [ - + ]: 3862 : Assert(txn->size == 0);
499 : :
3215 andres@anarazel.de 500 : 3862 : pfree(txn);
4307 rhaas@postgresql.org 501 : 3862 : }
502 : :
503 : : /*
504 : : * Allocate a ReorderBufferChange.
505 : : */
506 : : ReorderBufferChange *
280 heikki.linnakangas@i 507 : 1624729 : ReorderBufferAllocChange(ReorderBuffer *rb)
508 : : {
509 : : ReorderBufferChange *change;
510 : :
511 : : change = (ReorderBufferChange *)
3215 andres@anarazel.de 512 : 1624729 : MemoryContextAlloc(rb->change_context, sizeof(ReorderBufferChange));
513 : :
4307 rhaas@postgresql.org 514 : 1624729 : memset(change, 0, sizeof(ReorderBufferChange));
515 : 1624729 : return change;
516 : : }
517 : :
518 : : /*
519 : : * Free a ReorderBufferChange and update memory accounting, if requested.
520 : : */
521 : : void
280 heikki.linnakangas@i 522 : 1624490 : ReorderBufferFreeChange(ReorderBuffer *rb, ReorderBufferChange *change,
523 : : bool upd_mem)
524 : : {
525 : : /* update memory accounting info */
1957 akapila@postgresql.o 526 [ + + ]: 1624490 : if (upd_mem)
623 msawada@postgresql.o 527 : 202477 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false,
528 : : ReorderBufferChangeSize(change));
529 : :
530 : : /* free contained data */
4303 tgl@sss.pgh.pa.us 531 [ + + + + : 1624490 : switch (change->action)
+ + - ]
532 : : {
533 : 1549918 : case REORDER_BUFFER_CHANGE_INSERT:
534 : : case REORDER_BUFFER_CHANGE_UPDATE:
535 : : case REORDER_BUFFER_CHANGE_DELETE:
536 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
537 [ + + ]: 1549918 : if (change->data.tp.newtuple)
538 : : {
280 heikki.linnakangas@i 539 : 1270465 : ReorderBufferFreeTupleBuf(change->data.tp.newtuple);
4303 tgl@sss.pgh.pa.us 540 : 1270465 : change->data.tp.newtuple = NULL;
541 : : }
542 : :
543 [ + + ]: 1549918 : if (change->data.tp.oldtuple)
544 : : {
280 heikki.linnakangas@i 545 : 211152 : ReorderBufferFreeTupleBuf(change->data.tp.oldtuple);
4303 tgl@sss.pgh.pa.us 546 : 211152 : change->data.tp.oldtuple = NULL;
547 : : }
4307 rhaas@postgresql.org 548 : 1549918 : break;
3542 simon@2ndQuadrant.co 549 : 40 : case REORDER_BUFFER_CHANGE_MESSAGE:
550 [ + - ]: 40 : if (change->data.msg.prefix != NULL)
551 : 40 : pfree(change->data.msg.prefix);
552 : 40 : change->data.msg.prefix = NULL;
553 [ + - ]: 40 : if (change->data.msg.message != NULL)
554 : 40 : pfree(change->data.msg.message);
555 : 40 : change->data.msg.message = NULL;
556 : 40 : break;
1889 akapila@postgresql.o 557 : 5191 : case REORDER_BUFFER_CHANGE_INVALIDATION:
558 [ + - ]: 5191 : if (change->data.inval.invalidations)
559 : 5191 : pfree(change->data.inval.invalidations);
560 : 5191 : change->data.inval.invalidations = NULL;
561 : 5191 : break;
4307 rhaas@postgresql.org 562 : 1254 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4303 tgl@sss.pgh.pa.us 563 [ + - ]: 1254 : if (change->data.snapshot)
564 : : {
565 : 1254 : ReorderBufferFreeSnap(rb, change->data.snapshot);
566 : 1254 : change->data.snapshot = NULL;
567 : : }
4307 rhaas@postgresql.org 568 : 1254 : break;
569 : : /* no data in addition to the struct itself */
2662 tomas.vondra@postgre 570 : 49 : case REORDER_BUFFER_CHANGE_TRUNCATE:
571 [ + - ]: 49 : if (change->data.truncate.relids != NULL)
572 : : {
280 heikki.linnakangas@i 573 : 49 : ReorderBufferFreeRelids(rb, change->data.truncate.relids);
2662 tomas.vondra@postgre 574 : 49 : change->data.truncate.relids = NULL;
575 : : }
576 : 49 : break;
3876 andres@anarazel.de 577 : 68038 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
578 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
579 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
580 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4307 rhaas@postgresql.org 581 : 68038 : break;
582 : : }
583 : :
3215 andres@anarazel.de 584 : 1624490 : pfree(change);
4307 rhaas@postgresql.org 585 : 1624490 : }
586 : :
587 : : /*
588 : : * Allocate a HeapTuple fitting a tuple of size tuple_len (excluding header
589 : : * overhead).
590 : : */
591 : : HeapTuple
280 heikki.linnakangas@i 592 : 1481660 : ReorderBufferAllocTupleBuf(ReorderBuffer *rb, Size tuple_len)
593 : : {
594 : : HeapTuple tuple;
595 : : Size alloc_len;
596 : :
3574 andres@anarazel.de 597 : 1481660 : alloc_len = tuple_len + SizeofHeapTupleHeader;
598 : :
688 msawada@postgresql.o 599 : 1481660 : tuple = (HeapTuple) MemoryContextAlloc(rb->tup_context,
600 : : HEAPTUPLESIZE + alloc_len);
601 : 1481660 : tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
602 : :
4307 rhaas@postgresql.org 603 : 1481660 : return tuple;
604 : : }
605 : :
606 : : /*
607 : : * Free a HeapTuple returned by ReorderBufferAllocTupleBuf().
608 : : */
609 : : void
280 heikki.linnakangas@i 610 : 1481617 : ReorderBufferFreeTupleBuf(HeapTuple tuple)
611 : : {
2946 simon@2ndQuadrant.co 612 : 1481617 : pfree(tuple);
4307 rhaas@postgresql.org 613 : 1481617 : }
614 : :
615 : : /*
616 : : * Allocate an array for relids of truncated relations.
617 : : *
618 : : * We use the global memory context (for the whole reorder buffer), because
619 : : * none of the existing ones seems like a good match (some are SLAB, so we
620 : : * can't use those, and tup_context is meant for tuple data, not relids). We
621 : : * could add yet another context, but it seems like overkill - TRUNCATE is
622 : : * not a particularly common operation, so it does not seem worth it.
623 : : */
624 : : Oid *
280 heikki.linnakangas@i 625 : 54 : ReorderBufferAllocRelids(ReorderBuffer *rb, int nrelids)
626 : : {
627 : : Oid *relids;
628 : : Size alloc_len;
629 : :
2662 tomas.vondra@postgre 630 : 54 : alloc_len = sizeof(Oid) * nrelids;
631 : :
632 : 54 : relids = (Oid *) MemoryContextAlloc(rb->context, alloc_len);
633 : :
634 : 54 : return relids;
635 : : }
636 : :
637 : : /*
638 : : * Free an array of relids.
639 : : */
640 : : void
280 heikki.linnakangas@i 641 : 49 : ReorderBufferFreeRelids(ReorderBuffer *rb, Oid *relids)
642 : : {
2662 tomas.vondra@postgre 643 : 49 : pfree(relids);
644 : 49 : }
645 : :
646 : : /*
647 : : * Return the ReorderBufferTXN from the given buffer, specified by Xid.
648 : : * If create is true, and a transaction doesn't already exist, create it
649 : : * (with the given LSN, and as top transaction if that's specified);
650 : : * when this happens, is_new is set to true.
651 : : */
652 : : static ReorderBufferTXN *
4307 rhaas@postgresql.org 653 : 5420640 : ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create,
654 : : bool *is_new, XLogRecPtr lsn, bool create_as_top)
655 : : {
656 : : ReorderBufferTXN *txn;
657 : : ReorderBufferTXNByIdEnt *ent;
658 : : bool found;
659 : :
660 [ - + ]: 5420640 : Assert(TransactionIdIsValid(xid));
661 : :
662 : : /*
663 : : * Check the one-entry lookup cache first
664 : : */
665 [ + + ]: 5420640 : if (TransactionIdIsValid(rb->by_txn_last_xid) &&
666 [ + + ]: 5416925 : rb->by_txn_last_xid == xid)
667 : : {
668 : 4727235 : txn = rb->by_txn_last_txn;
669 : :
670 [ + + ]: 4727235 : if (txn != NULL)
671 : : {
672 : : /* found it, and it's valid */
673 [ + + ]: 4727211 : if (is_new)
674 : 3205 : *is_new = false;
675 : 4727211 : return txn;
676 : : }
677 : :
678 : : /*
679 : : * cached as non-existent, and asked not to create? Then nothing else
680 : : * to do.
681 : : */
682 [ + + ]: 24 : if (!create)
683 : 21 : return NULL;
684 : : /* otherwise fall through to create it */
685 : : }
686 : :
687 : : /*
688 : : * If the cache wasn't hit or it yielded a "does-not-exist" and we want to
689 : : * create an entry.
690 : : */
691 : :
692 : : /* search the lookup table */
693 : : ent = (ReorderBufferTXNByIdEnt *)
694 : 693408 : hash_search(rb->by_txn,
695 : : &xid,
696 : : create ? HASH_ENTER : HASH_FIND,
697 : : &found);
698 [ + + ]: 693408 : if (found)
699 : 688193 : txn = ent->txn;
700 [ + + ]: 5215 : else if (create)
701 : : {
702 : : /* initialize the new entry, if creation was requested */
703 [ - + ]: 3920 : Assert(ent != NULL);
41 alvherre@kurilemu.de 704 [ - + ]:GNC 3920 : Assert(XLogRecPtrIsValid(lsn));
705 : :
280 heikki.linnakangas@i 706 :CBC 3920 : ent->txn = ReorderBufferAllocTXN(rb);
4307 rhaas@postgresql.org 707 : 3920 : ent->txn->xid = xid;
708 : 3920 : txn = ent->txn;
709 : 3920 : txn->first_lsn = lsn;
710 : 3920 : txn->restart_decoding_lsn = rb->current_restart_decoding_lsn;
711 : :
712 [ + + ]: 3920 : if (create_as_top)
713 : : {
714 : 3273 : dlist_push_tail(&rb->toplevel_by_lsn, &txn->node);
715 : 3273 : AssertTXNLsnOrder(rb);
716 : : }
717 : : }
718 : : else
719 : 1295 : txn = NULL; /* not found and not asked to create */
720 : :
721 : : /* update cache */
722 : 693408 : rb->by_txn_last_xid = xid;
723 : 693408 : rb->by_txn_last_txn = txn;
724 : :
725 [ + + ]: 693408 : if (is_new)
726 : 1711 : *is_new = !found;
727 : :
3552 andres@anarazel.de 728 [ + + - + ]: 693408 : Assert(!create || txn != NULL);
4307 rhaas@postgresql.org 729 : 693408 : return txn;
730 : : }
731 : :
732 : : /*
733 : : * Track partial changes for the streaming of in-progress transactions. Only
734 : : * complete changes can be streamed, so while a partial change (a toast table
735 : : * insert or a speculative insert) is pending, the top-level transaction of
736 : : * 'txn' is flagged RBTXN_HAS_PARTIAL_CHANGE so that it can't be streamed. Once
737 : : * the change is complete the flag is cleared and, if 'txn' was serialized
738 : : * before and streaming can start, the top-level transaction is streamed at once.
739 : : */
740 : : static void
1957 akapila@postgresql.o 741 : 1412739 : ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
742 : : ReorderBufferChange *change,
743 : : bool toast_insert)
744 : : {
745 : : ReorderBufferTXN *toptxn;
746 : :
747 : : /*
748 : : * The partial changes need to be processed only while streaming
749 : : * in-progress transactions.
750 : : */
751 [ + + ]: 1412739 : if (!ReorderBufferCanStream(rb))
752 : 909455 : return;
753 : :
754 : : /* Get the top transaction; the partial-change flag is only kept there. */
1006 755 [ + + ]: 503284 : toptxn = rbtxn_get_toptxn(txn);
756 : :
757 : : /*
758 : : * Indicate a partial change for toast inserts. The change will be
759 : : * considered as complete once we get the insert or update on the main
760 : : * table and we are sure that the pending toast chunks are not required
761 : : * anymore.
762 : : *
763 : : * If we allow streaming when there are pending toast chunks then such
764 : : * chunks won't be released till the insert (multi_insert) is complete and
765 : : * we expect the txn to have streamed all changes after streaming. This
766 : : * restriction is mainly to ensure the correctness of streamed
767 : : * transactions and it doesn't seem worth lifting such a restriction
768 : : * just to allow this case because anyway we will stream the transaction
769 : : * once such an insert is complete.
770 : : */
1957 771 [ + + ]: 503284 : if (toast_insert)
1665 772 : 1666 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
773 [ + + ]: 501618 : else if (rbtxn_has_partial_change(toptxn) &&
774 [ + + - + : 63 : IsInsertOrUpdate(change->action) &&
- - ]
775 [ + + ]: 63 : change->data.tp.clear_toast_afterwards)
776 : 43 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
777 : :
778 : : /*
779 : : * Indicate a partial change for speculative inserts. The change will be
780 : : * considered as complete once we get the speculative confirm or abort
781 : : * token.
782 : : */
1957 783 [ - + ]: 503284 : if (IsSpecInsert(change->action))
1665 akapila@postgresql.o 784 :UBC 0 : toptxn->txn_flags |= RBTXN_HAS_PARTIAL_CHANGE;
1665 akapila@postgresql.o 785 [ + + ]:CBC 503284 : else if (rbtxn_has_partial_change(toptxn) &&
1631 786 [ + - - + ]: 1686 : IsSpecConfirmOrAbort(change->action))
1665 akapila@postgresql.o 787 :UBC 0 : toptxn->txn_flags &= ~RBTXN_HAS_PARTIAL_CHANGE;
788 : :
789 : : /*
790 : : * Stream the transaction if it was serialized before and the changes are
791 : : * now complete in the top-level transaction.
792 : : *
793 : : * The reason for doing the streaming of such a transaction as soon as we
794 : : * get the complete change for it is that previously it would have reached
795 : : * the memory threshold and wouldn't get streamed because of incomplete
796 : : * changes. Delaying such transactions would increase apply lag for them.
797 : : */
1957 akapila@postgresql.o 798 [ + + ]:CBC 503284 : if (ReorderBufferCanStartStreaming(rb) &&
1665 799 [ + + ]: 169612 : !(rbtxn_has_partial_change(toptxn)) &&
1105 800 [ + + ]: 168076 : rbtxn_is_serialized(txn) &&
801 [ + + ]: 39 : rbtxn_has_streamable_change(toptxn))
1957 802 : 9 : ReorderBufferStreamTXN(rb, toptxn);
803 : : }
804 : :
805 : : /*
806 : : * Queue a change into a transaction so it can be replayed upon commit or be
807 : : * streamed once we reach the logical_decoding_work_mem threshold. 'change' is either appended to the txn's change list or, if the txn is known to be aborted, freed here.
808 : : */
809 : : void
4307 rhaas@postgresql.org 810 : 1422148 : ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
811 : : ReorderBufferChange *change, bool toast_insert)
812 : : {
813 : : ReorderBufferTXN *txn;
814 : :
815 : 1422148 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
816 : :
817 : : /*
818 : : * If we have detected that the transaction is aborted while streaming the
819 : : * previous changes or by checking its CLOG, there is no point in
820 : : * collecting further changes for it.
821 : : */
308 msawada@postgresql.o 822 [ + + ]: 1422148 : if (rbtxn_is_aborted(txn))
823 : : {
824 : : /*
825 : : * We don't need to update memory accounting for this change as we
826 : : * have not added it to the queue yet.
827 : : */
280 heikki.linnakangas@i 828 : 9409 : ReorderBufferFreeChange(rb, change, false);
1957 akapila@postgresql.o 829 : 9409 : return;
830 : : }
831 : :
832 : : /*
833 : : * The changes that are sent downstream are considered streamable. We
834 : : * remember such transactions (by flagging their top-level transaction)
835 : : * so that only those will later be considered for streaming.
836 : : */
1105 837 [ + + ]: 1412739 : if (change->action == REORDER_BUFFER_CHANGE_INSERT ||
838 [ + + ]: 541182 : change->action == REORDER_BUFFER_CHANGE_UPDATE ||
839 [ + + ]: 333930 : change->action == REORDER_BUFFER_CHANGE_DELETE ||
840 [ + + ]: 66441 : change->action == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT ||
841 [ + + ]: 48525 : change->action == REORDER_BUFFER_CHANGE_TRUNCATE ||
842 [ + + ]: 48481 : change->action == REORDER_BUFFER_CHANGE_MESSAGE)
843 : : {
1006 844 [ + + ]: 1364297 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
845 : :
1105 846 : 1364297 : toptxn->txn_flags |= RBTXN_HAS_STREAMABLE_CHANGE;
847 : : }
848 : :
4307 rhaas@postgresql.org 849 : 1412739 : change->lsn = lsn;
2223 akapila@postgresql.o 850 : 1412739 : change->txn = txn;
851 : :
41 alvherre@kurilemu.de 852 [ - + ]:GNC 1412739 : Assert(XLogRecPtrIsValid(lsn));
4307 rhaas@postgresql.org 853 :CBC 1412739 : dlist_push_tail(&txn->changes, &change->node);
854 : 1412739 : txn->nentries++;
855 : 1412739 : txn->nentries_mem++;
856 : :
857 : : /* update memory accounting information */
623 msawada@postgresql.o 858 : 1412739 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
859 : : ReorderBufferChangeSize(change));
860 : :
861 : : /* process partial change (may stream the top-level transaction) */
1957 akapila@postgresql.o 862 : 1412739 : ReorderBufferProcessPartialChange(rb, txn, change, toast_insert);
863 : :
864 : : /* check the memory limits and evict something if needed */
2223 865 : 1412739 : ReorderBufferCheckMemoryLimit(rb);
866 : : }
867 : :
868 : : /*
869 : : * A transactional message is queued (as a REORDER_BUFFER_CHANGE_MESSAGE change holding its own copies of prefix and message) to be processed upon commit and a
870 : : * non-transactional message gets processed immediately by calling rb->message under a historic snapshot.
871 : : */
872 : : void
3542 simon@2ndQuadrant.co 873 : 47 : ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid,
874 : : Snapshot snap, XLogRecPtr lsn,
875 : : bool transactional, const char *prefix,
876 : : Size message_size, const char *message)
877 : : {
878 [ + + ]: 47 : if (transactional)
879 : : {
880 : : MemoryContext oldcontext;
881 : : ReorderBufferChange *change;
882 : :
883 [ - + ]: 39 : Assert(xid != InvalidTransactionId);
884 : :
885 : : /*
886 : : * We don't expect snapshots for transactional changes - we'll use the
887 : : * snapshot derived later during apply (unless the change gets
888 : : * skipped).
889 : : */
1029 tomas.vondra@postgre 890 [ - + ]: 39 : Assert(!snap);
891 : :
3542 simon@2ndQuadrant.co 892 : 39 : oldcontext = MemoryContextSwitchTo(rb->context);
893 : :
280 heikki.linnakangas@i 894 : 39 : change = ReorderBufferAllocChange(rb);
3542 simon@2ndQuadrant.co 895 : 39 : change->action = REORDER_BUFFER_CHANGE_MESSAGE;
896 : 39 : change->data.msg.prefix = pstrdup(prefix); /* copied while rb->context is current */
897 : 39 : change->data.msg.message_size = message_size;
898 : 39 : change->data.msg.message = palloc(message_size);
899 : 39 : memcpy(change->data.msg.message, message, message_size);
900 : :
1957 akapila@postgresql.o 901 : 39 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
902 : :
3542 simon@2ndQuadrant.co 903 : 39 : MemoryContextSwitchTo(oldcontext);
904 : : }
905 : : else
906 : : {
3478 rhaas@postgresql.org 907 : 8 : ReorderBufferTXN *txn = NULL; /* stays NULL when no xid was given */
1187 pg@bowt.ie 908 : 8 : volatile Snapshot snapshot_now = snap;
909 : :
910 : : /* Non-transactional changes require a valid snapshot. */
1029 tomas.vondra@postgre 911 [ - + ]: 8 : Assert(snapshot_now);
912 : :
3542 simon@2ndQuadrant.co 913 [ + + ]: 8 : if (xid != InvalidTransactionId)
914 : 3 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
915 : :
916 : : /* setup snapshot to allow catalog access */
917 : 8 : SetupHistoricSnapshot(snapshot_now, NULL);
918 [ + - ]: 8 : PG_TRY();
919 : : {
920 : 8 : rb->message(rb, txn, lsn, false, prefix, message_size, message);
921 : :
922 : 8 : TeardownHistoricSnapshot(false);
923 : : }
3542 simon@2ndQuadrant.co 924 :UBC 0 : PG_CATCH();
925 : : {
926 : 0 : TeardownHistoricSnapshot(true); /* tear down on the error path too, then re-throw */
927 : 0 : PG_RE_THROW();
928 : : }
3542 simon@2ndQuadrant.co 929 [ - + ]:CBC 8 : PG_END_TRY();
930 : : }
931 : 47 : }
932 : :
933 : : /*
934 : : * AssertTXNLsnOrder
935 : : * Verify LSN ordering of transaction lists in the reorderbuffer
936 : : *
937 : : * Both rb->toplevel_by_lsn and rb->txns_by_base_snapshot_lsn are walked. Other LSN-related invariants are checked too.
938 : : *
939 : : * No-op if assertions are not in use.
940 : : */
941 : : static void
4307 rhaas@postgresql.org 942 : 8026 : AssertTXNLsnOrder(ReorderBuffer *rb)
943 : : {
944 : : #ifdef USE_ASSERT_CHECKING
1154 akapila@postgresql.o 945 : 8026 : LogicalDecodingContext *ctx = rb->private_data;
946 : : dlist_iter iter;
4307 rhaas@postgresql.org 947 : 8026 : XLogRecPtr prev_first_lsn = InvalidXLogRecPtr;
2731 alvherre@alvh.no-ip. 948 : 8026 : XLogRecPtr prev_base_snap_lsn = InvalidXLogRecPtr;
949 : :
950 : : /*
951 : : * Skip the verification if we have not yet reached the LSN at which we
952 : : * start decoding the contents of transactions, because until we reach that
953 : : * LSN, we could have transactions that don't have the association between
954 : : * the top-level transaction and subtransaction yet and consequently have
955 : : * the same LSN. We don't guarantee this association until we try to
956 : : * decode the actual contents of transaction. The ordering of the records
957 : : * prior to the start_decoding_at LSN should have been checked before the
958 : : * restart.
959 : : */
1154 akapila@postgresql.o 960 [ + + ]: 8026 : if (SnapBuildXactNeedsSkip(ctx->snapshot_builder, ctx->reader->EndRecPtr))
961 : 3855 : return;
962 : :
4307 rhaas@postgresql.org 963 [ + - + + ]: 7853 : dlist_foreach(iter, &rb->toplevel_by_lsn)
964 : : {
2731 alvherre@alvh.no-ip. 965 : 3682 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN, node,
966 : : iter.cur);
967 : :
968 : : /* start LSN must be set */
41 alvherre@kurilemu.de 969 [ - + ]:GNC 3682 : Assert(XLogRecPtrIsValid(cur_txn->first_lsn));
970 : :
971 : : /* If there is an end LSN, it must not be lower than the start LSN */
972 [ + + ]: 3682 : if (XLogRecPtrIsValid(cur_txn->end_lsn))
4307 rhaas@postgresql.org 973 [ - + ]:CBC 20 : Assert(cur_txn->first_lsn <= cur_txn->end_lsn);
974 : :
975 : : /* Current initial LSN must be strictly higher than previous */
41 alvherre@kurilemu.de 976 [ + + ]:GNC 3682 : if (XLogRecPtrIsValid(prev_first_lsn))
4307 rhaas@postgresql.org 977 [ - + ]:CBC 232 : Assert(prev_first_lsn < cur_txn->first_lsn);
978 : :
979 : : /* known-as-subtxn txns must not be listed */
2168 alvherre@alvh.no-ip. 980 [ - + ]: 3682 : Assert(!rbtxn_is_known_subxact(cur_txn));
981 : :
4307 rhaas@postgresql.org 982 : 3682 : prev_first_lsn = cur_txn->first_lsn;
983 : : }
984 : :
2731 alvherre@alvh.no-ip. 985 [ + - + + ]: 6206 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
986 : : {
987 : 2035 : ReorderBufferTXN *cur_txn = dlist_container(ReorderBufferTXN,
988 : : base_snapshot_node,
989 : : iter.cur);
990 : :
991 : : /* base snapshot (and its LSN) must be set */
992 [ - + ]: 2035 : Assert(cur_txn->base_snapshot != NULL);
41 alvherre@kurilemu.de 993 [ - + ]:GNC 2035 : Assert(XLogRecPtrIsValid(cur_txn->base_snapshot_lsn));
994 : :
995 : : /* current LSN must be strictly higher than previous */
996 [ + + ]: 2035 : if (XLogRecPtrIsValid(prev_base_snap_lsn))
2731 alvherre@alvh.no-ip. 997 [ - + ]:CBC 177 : Assert(prev_base_snap_lsn < cur_txn->base_snapshot_lsn);
998 : :
999 : : /* known-as-subtxn txns must not be listed */
2168 1000 [ - + ]: 2035 : Assert(!rbtxn_is_known_subxact(cur_txn));
1001 : :
2731 1002 : 2035 : prev_base_snap_lsn = cur_txn->base_snapshot_lsn;
1003 : : }
1004 : : #endif
1005 : : }
1006 : :
1007 : : /*
1008 : : * AssertChangeLsnOrder
1009 : : *
1010 : : * Check ordering of changes in the (sub)transaction: every change must have a valid LSN that lies between the txn's first_lsn and (if set) end_lsn, and LSNs must not decrease along the list. No-op if assertions are not in use.
1011 : : */
1012 : : static void
1957 akapila@postgresql.o 1013 : 2570 : AssertChangeLsnOrder(ReorderBufferTXN *txn)
1014 : : {
1015 : : #ifdef USE_ASSERT_CHECKING
1016 : : dlist_iter iter;
1017 : 2570 : XLogRecPtr prev_lsn = txn->first_lsn;
1018 : :
1019 [ + - + + ]: 185942 : dlist_foreach(iter, &txn->changes)
1020 : : {
1021 : : ReorderBufferChange *cur_change;
1022 : :
1023 : 183372 : cur_change = dlist_container(ReorderBufferChange, node, iter.cur);
1024 : :
41 alvherre@kurilemu.de 1025 [ - + ]:GNC 183372 : Assert(XLogRecPtrIsValid(txn->first_lsn));
1026 [ - + ]: 183372 : Assert(XLogRecPtrIsValid(cur_change->lsn));
1957 akapila@postgresql.o 1027 [ - + ]:CBC 183372 : Assert(txn->first_lsn <= cur_change->lsn);
1028 : :
41 alvherre@kurilemu.de 1029 [ + + ]:GNC 183372 : if (XLogRecPtrIsValid(txn->end_lsn))
1957 akapila@postgresql.o 1030 [ - + ]:CBC 29930 : Assert(cur_change->lsn <= txn->end_lsn);
1031 : :
1032 [ - + ]: 183372 : Assert(prev_lsn <= cur_change->lsn); /* equal LSNs are allowed */
1033 : :
1034 : 183372 : prev_lsn = cur_change->lsn;
1035 : : }
1036 : : #endif
1037 : 2570 : }
1038 : :
1039 : : /*
1040 : : * ReorderBufferGetOldestTXN
1041 : : * Return oldest transaction in reorderbuffer, i.e. the head of rb->toplevel_by_lsn, or NULL if that list is empty
1042 : : */
1043 : : ReorderBufferTXN *
4307 rhaas@postgresql.org 1044 : 420 : ReorderBufferGetOldestTXN(ReorderBuffer *rb)
1045 : : {
1046 : : ReorderBufferTXN *txn;
1047 : :
2731 alvherre@alvh.no-ip. 1048 : 420 : AssertTXNLsnOrder(rb);
1049 : :
4307 rhaas@postgresql.org 1050 [ + + ]: 420 : if (dlist_is_empty(&rb->toplevel_by_lsn))
1051 : 364 : return NULL;
1052 : :
1053 : 56 : txn = dlist_head_element(ReorderBufferTXN, node, &rb->toplevel_by_lsn);
1054 : :
2168 alvherre@alvh.no-ip. 1055 [ - + ]: 56 : Assert(!rbtxn_is_known_subxact(txn));
41 alvherre@kurilemu.de 1056 [ - + ]:GNC 56 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4307 rhaas@postgresql.org 1057 :CBC 56 : return txn;
1058 : : }
1059 : :
1060 : : /*
1061 : : * ReorderBufferGetOldestXmin
1062 : : * Return oldest Xmin in reorderbuffer
1063 : : *
1064 : : * Returns the oldest possibly-running Xid from the point of view of snapshots
1065 : : * used in the transactions kept by reorderbuffer, or InvalidTransactionId if
1066 : : * there are none.
1067 : : *
1068 : : * Since snapshots are assigned monotonically, this equals the Xmin of the
1069 : : * base snapshot with minimal base_snapshot_lsn, which is the head of rb->txns_by_base_snapshot_lsn.
1070 : : */
1071 : : TransactionId
2731 alvherre@alvh.no-ip. 1072 : 436 : ReorderBufferGetOldestXmin(ReorderBuffer *rb)
1073 : : {
1074 : : ReorderBufferTXN *txn;
1075 : :
1076 : 436 : AssertTXNLsnOrder(rb);
1077 : :
1078 [ + + ]: 436 : if (dlist_is_empty(&rb->txns_by_base_snapshot_lsn))
1079 : 389 : return InvalidTransactionId;
1080 : :
1081 : 47 : txn = dlist_head_element(ReorderBufferTXN, base_snapshot_node,
1082 : : &rb->txns_by_base_snapshot_lsn);
1083 : 47 : return txn->base_snapshot->xmin;
1084 : : }
1085 : :
1086 : : void
4307 rhaas@postgresql.org 1087 : 480 : ReorderBufferSetRestartPoint(ReorderBuffer *rb, XLogRecPtr ptr)
1088 : : {
1089 : 480 : rb->current_restart_decoding_lsn = ptr; /* copied into restart_decoding_lsn of each transaction created afterwards */
1090 : 480 : }
1091 : :
1092 : : /*
1093 : : * ReorderBufferAssignChild
1094 : : *
1095 : : * Make note that we know that subxid is a subtransaction of xid, seen as of
1096 : : * the given lsn. Does nothing if subxid is already known as a subtransaction.
1097 : : */
1098 : : void
1099 : 833 : ReorderBufferAssignChild(ReorderBuffer *rb, TransactionId xid,
1100 : : TransactionId subxid, XLogRecPtr lsn)
1101 : : {
1102 : : ReorderBufferTXN *txn;
1103 : : ReorderBufferTXN *subtxn;
1104 : : bool new_top;
1105 : : bool new_sub;
1106 : :
1107 : 833 : txn = ReorderBufferTXNByXid(rb, xid, true, &new_top, lsn, true);
1108 : 833 : subtxn = ReorderBufferTXNByXid(rb, subxid, true, &new_sub, lsn, false); /* NOTE(review): final 'false' is presumably create_as_top, keeping a new subtxn off toplevel_by_lsn - confirm */
1109 : :
2731 alvherre@alvh.no-ip. 1110 [ + + ]: 833 : if (!new_sub)
1111 : : {
2168 1112 [ + - ]: 186 : if (rbtxn_is_known_subxact(subtxn))
1113 : : {
1114 : : /* already associated, nothing to do */
2731 1115 : 186 : return;
1116 : : }
1117 : : else
1118 : : {
1119 : : /*
1120 : : * We already saw this transaction, but initially added it to the
1121 : : * list of top-level txns. Now that we know it's not top-level,
1122 : : * remove it from there.
1123 : : */
2731 alvherre@alvh.no-ip. 1124 :UBC 0 : dlist_delete(&subtxn->node);
1125 : : }
1126 : : }
1127 : :
2168 alvherre@alvh.no-ip. 1128 :CBC 647 : subtxn->txn_flags |= RBTXN_IS_SUBXACT;
2731 1129 : 647 : subtxn->toplevel_xid = xid;
1130 [ - + ]: 647 : Assert(subtxn->nsubtxns == 0);
1131 : :
1132 : : /* set the reference to top-level transaction */
1973 akapila@postgresql.o 1133 : 647 : subtxn->toptxn = txn;
1134 : :
1135 : : /* add to subtransaction list (subtxn->node is reused for this list) */
2731 alvherre@alvh.no-ip. 1136 : 647 : dlist_push_tail(&txn->subtxns, &subtxn->node);
1137 : 647 : txn->nsubtxns++;
1138 : :
1139 : : /* Possibly transfer the subtxn's snapshot to its top-level txn. */
1140 : 647 : ReorderBufferTransferSnapToParent(txn, subtxn);
1141 : :
1142 : : /* Verify LSN-ordering invariant */
1143 : 647 : AssertTXNLsnOrder(rb);
1144 : : }
1145 : :
1146 : : /*
1147 : : * ReorderBufferTransferSnapToParent
1148 : : * Transfer base snapshot from subtxn to top-level txn, if needed
1149 : : *
1150 : : * This is done if the top-level txn doesn't have a base snapshot, or if the
1151 : : * subtxn's base snapshot has an earlier LSN than the top-level txn's base
1152 : : * snapshot's LSN. This can happen if there are no changes in the toplevel
1153 : : * txn but there are some in the subtxn, or the first change in subtxn has
1154 : : * earlier LSN than first change in the top-level txn and we learned about
1155 : : * their kinship only now.
1156 : : *
1157 : : * The subtransaction's snapshot is cleared regardless of the transfer
1158 : : * happening, since it's not needed anymore in either case.
1159 : : *
1160 : : * We do this as soon as we become aware of their kinship, to avoid queueing
1161 : : * extra snapshots to txns known-as-subtxns -- only top-level txns will
1162 : : * receive further snapshots.
1163 : : */
1164 : : static void
1165 : 651 : ReorderBufferTransferSnapToParent(ReorderBufferTXN *txn,
1166 : : ReorderBufferTXN *subtxn)
1167 : : {
1168 [ - + ]: 651 : Assert(subtxn->toplevel_xid == txn->xid);
1169 : :
1170 [ - + ]: 651 : if (subtxn->base_snapshot != NULL)
1171 : : {
2731 alvherre@alvh.no-ip. 1172 [ # # ]:UBC 0 : if (txn->base_snapshot == NULL ||
1173 [ # # ]: 0 : subtxn->base_snapshot_lsn < txn->base_snapshot_lsn)
1174 : : {
1175 : : /*
1176 : : * If the toplevel transaction already has a base snapshot but
1177 : : * it's newer than the subxact's, release it and unlink the txn.
1178 : : */
1179 [ # # ]: 0 : if (txn->base_snapshot != NULL)
1180 : : {
1181 : 0 : SnapBuildSnapDecRefcount(txn->base_snapshot);
1182 : 0 : dlist_delete(&txn->base_snapshot_node);
1183 : : }
1184 : :
1185 : : /*
1186 : : * The snapshot is now the top transaction's; transfer it, and
1187 : : * adjust the position of the top transaction in the list by
1188 : : * inserting it just before the subtransaction's entry.
1189 : : */
1190 : 0 : txn->base_snapshot = subtxn->base_snapshot;
1191 : 0 : txn->base_snapshot_lsn = subtxn->base_snapshot_lsn;
1192 : 0 : dlist_insert_before(&subtxn->base_snapshot_node,
1193 : : &txn->base_snapshot_node);
1194 : :
1195 : : /*
1196 : : * The subtransaction doesn't have a snapshot anymore (so it
1197 : : * mustn't be in the list.)
1198 : : */
1199 : 0 : subtxn->base_snapshot = NULL;
1200 : 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1201 : 0 : dlist_delete(&subtxn->base_snapshot_node);
1202 : : }
1203 : : else
1204 : : {
1205 : : /* Base snap of toplevel is fine, so subxact's is not needed */
1206 : 0 : SnapBuildSnapDecRefcount(subtxn->base_snapshot);
1207 : 0 : dlist_delete(&subtxn->base_snapshot_node);
1208 : 0 : subtxn->base_snapshot = NULL;
1209 : 0 : subtxn->base_snapshot_lsn = InvalidXLogRecPtr;
1210 : : }
1211 : : }
4307 rhaas@postgresql.org 1212 :CBC 651 : }
1213 : :
1214 : : /*
1215 : : * Associate a subtransaction with its toplevel transaction at commit
1216 : : * time. There may be no further changes added after this. commit_lsn and end_lsn are recorded in the subtxn's final_lsn and end_lsn.
1217 : : */
1218 : : void
1219 : 267 : ReorderBufferCommitChild(ReorderBuffer *rb, TransactionId xid,
1220 : : TransactionId subxid, XLogRecPtr commit_lsn,
1221 : : XLogRecPtr end_lsn)
1222 : : {
1223 : : ReorderBufferTXN *subtxn;
1224 : :
1225 : 267 : subtxn = ReorderBufferTXNByXid(rb, subxid, false, NULL,
1226 : : InvalidXLogRecPtr, false);
1227 : :
1228 : : /*
1229 : : * No need to do anything if that subtxn didn't contain any changes (the lookup above does not create it, so it comes back NULL)
1230 : : */
1231 [ + + ]: 267 : if (!subtxn)
1232 : 81 : return;
1233 : :
1234 : 186 : subtxn->final_lsn = commit_lsn;
1235 : 186 : subtxn->end_lsn = end_lsn;
1236 : :
1237 : : /*
1238 : : * Assign this subxact as a child of the toplevel xact (no-op if already
1239 : : * done.)
1240 : : */
2731 alvherre@alvh.no-ip. 1241 : 186 : ReorderBufferAssignChild(rb, xid, subxid, InvalidXLogRecPtr);
1242 : : }
1243 : :
1244 : :
1245 : : /*
1246 : : * Support for efficiently iterating over a transaction's and its
1247 : : * subtransactions' changes.
1248 : : *
1249 : : * We do this by performing a k-way merge between transactions/subtransactions. For that
1250 : : * we model the current heads of the different transactions as a binary heap
1251 : : * so we easily know which (sub-)transaction has the change with the smallest
1252 : : * lsn next.
1253 : : *
1254 : : * We assume the changes in individual transactions are already sorted by LSN.
1255 : : */
1256 : :
1257 : : /*
1258 : : * Binary heap comparison function. 'a' and 'b' are int32 offsets into the entries[] array of the ReorderBufferIterTXNState passed as 'arg'. The result is deliberately inverted with respect to LSN order: the entry with the smaller LSN compares as greater.
1259 : : */
1260 : : static int
4307 rhaas@postgresql.org 1261 : 51568 : ReorderBufferIterCompare(Datum a, Datum b, void *arg)
1262 : : {
1263 : 51568 : ReorderBufferIterTXNState *state = (ReorderBufferIterTXNState *) arg;
1264 : 51568 : XLogRecPtr pos_a = state->entries[DatumGetInt32(a)].lsn;
1265 : 51568 : XLogRecPtr pos_b = state->entries[DatumGetInt32(b)].lsn;
1266 : :
1267 [ + + ]: 51568 : if (pos_a < pos_b)
1268 : 50712 : return 1; /* a has the smaller LSN, so it ranks higher */
1269 [ - + ]: 856 : else if (pos_a == pos_b)
4307 rhaas@postgresql.org 1270 :UBC 0 : return 0;
4307 rhaas@postgresql.org 1271 :CBC 856 : return -1;
1272 : : }
1273 : :
1274 : : /*
1275 : : * Allocate & initialize an iterator which iterates in lsn order over a
1276 : : * transaction and all its subtransactions.
1277 : : *
1278 : : * Note: The iterator state is returned through iter_state parameter rather
1279 : : * than the function's return value. This is because the state gets cleaned up
1280 : : * in a PG_CATCH block in the caller, so we want to make sure the caller gets
1281 : : * back the state even if this function throws an exception. *iter_state is NULL until the state has been initialized far enough to be cleaned up.
1282 : : */
1283 : : static void
2195 akapila@postgresql.o 1284 : 2107 : ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn,
1285 : : ReorderBufferIterTXNState *volatile *iter_state)
1286 : : {
4307 rhaas@postgresql.org 1287 : 2107 : Size nr_txns = 0;
1288 : : ReorderBufferIterTXNState *state;
1289 : : dlist_iter cur_txn_i;
1290 : : int32 off;
1291 : :
2195 akapila@postgresql.o 1292 : 2107 : *iter_state = NULL;
1293 : :
1294 : : /* Check ordering of changes in the toplevel transaction. */
1957 1295 : 2107 : AssertChangeLsnOrder(txn);
1296 : :
1297 : : /*
1298 : : * Calculate the size of our heap: one element for every transaction that
1299 : : * contains changes. (Besides the transactions already in the reorder
1300 : : * buffer, we count the one we were directly passed.)
1301 : : */
4307 rhaas@postgresql.org 1302 [ + + ]: 2107 : if (txn->nentries > 0)
1303 : 1928 : nr_txns++;
1304 : :
1305 [ + - + + ]: 2570 : dlist_foreach(cur_txn_i, &txn->subtxns)
1306 : : {
1307 : : ReorderBufferTXN *cur_txn;
1308 : :
1309 : 463 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1310 : :
1311 : : /* Check ordering of changes in this subtransaction. */
1957 akapila@postgresql.o 1312 : 463 : AssertChangeLsnOrder(cur_txn);
1313 : :
4307 rhaas@postgresql.org 1314 [ + + ]: 463 : if (cur_txn->nentries > 0)
1315 : 301 : nr_txns++;
1316 : : }
1317 : :
1318 : : /* allocate iteration state, with room for one entry per counted transaction */
1319 : : state = (ReorderBufferIterTXNState *)
1320 : 2107 : MemoryContextAllocZero(rb->context,
1321 : : sizeof(ReorderBufferIterTXNState) +
1322 : 2107 : sizeof(ReorderBufferIterTXNEntry) * nr_txns);
1323 : :
1324 : 2107 : state->nr_txns = nr_txns;
1325 : 2107 : dlist_init(&state->old_change);
1326 : :
1327 [ + + ]: 4336 : for (off = 0; off < state->nr_txns; off++)
1328 : : {
2195 akapila@postgresql.o 1329 : 2229 : state->entries[off].file.vfd = -1; /* -1 = no file open; tested in ReorderBufferIterTXNFinish() */
4307 rhaas@postgresql.org 1330 : 2229 : state->entries[off].segno = 0;
1331 : : }
1332 : :
1333 : : /* allocate heap */
1334 : 2107 : state->heap = binaryheap_allocate(state->nr_txns,
1335 : : ReorderBufferIterCompare,
1336 : : state);
1337 : :
1338 : : /* Now that the state fields are initialized, it is safe to return it. */
2195 akapila@postgresql.o 1339 : 2107 : *iter_state = state;
1340 : :
1341 : : /*
1342 : : * Now insert items into the binary heap, in an unordered fashion. (We
1343 : : * will run a heap assembly step at the end; this is more efficient.)
1344 : : */
1345 : :
4307 rhaas@postgresql.org 1346 : 2107 : off = 0;
1347 : :
1348 : : /* add toplevel transaction if it contains changes */
1349 [ + + ]: 2107 : if (txn->nentries > 0)
1350 : : {
1351 : : ReorderBufferChange *cur_change;
1352 : :
2168 alvherre@alvh.no-ip. 1353 [ + + ]: 1928 : if (rbtxn_is_serialized(txn))
1354 : : {
1355 : : /* serialize remaining changes, then restore from disk */
3362 andres@anarazel.de 1356 : 23 : ReorderBufferSerializeTXN(rb, txn);
2195 akapila@postgresql.o 1357 : 23 : ReorderBufferRestoreChanges(rb, txn, &state->entries[off].file,
1358 : : &state->entries[off].segno);
1359 : : }
1360 : :
4307 rhaas@postgresql.org 1361 : 1928 : cur_change = dlist_head_element(ReorderBufferChange, node,
1362 : : &txn->changes);
1363 : :
1364 : 1928 : state->entries[off].lsn = cur_change->lsn;
1365 : 1928 : state->entries[off].change = cur_change;
1366 : 1928 : state->entries[off].txn = txn;
1367 : :
1368 : 1928 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1369 : : }
1370 : :
1371 : : /* add subtransactions if they contain changes */
1372 [ + - + + ]: 2570 : dlist_foreach(cur_txn_i, &txn->subtxns)
1373 : : {
1374 : : ReorderBufferTXN *cur_txn;
1375 : :
1376 : 463 : cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur);
1377 : :
1378 [ + + ]: 463 : if (cur_txn->nentries > 0)
1379 : : {
1380 : : ReorderBufferChange *cur_change;
1381 : :
2168 alvherre@alvh.no-ip. 1382 [ + + ]: 301 : if (rbtxn_is_serialized(cur_txn))
1383 : : {
1384 : : /* serialize remaining changes, then restore from disk */
3362 andres@anarazel.de 1385 : 17 : ReorderBufferSerializeTXN(rb, cur_txn);
4307 rhaas@postgresql.org 1386 : 17 : ReorderBufferRestoreChanges(rb, cur_txn,
1387 : : &state->entries[off].file,
1388 : : &state->entries[off].segno);
1389 : : }
1390 : 301 : cur_change = dlist_head_element(ReorderBufferChange, node,
1391 : : &cur_txn->changes);
1392 : :
1393 : 301 : state->entries[off].lsn = cur_change->lsn;
1394 : 301 : state->entries[off].change = cur_change;
1395 : 301 : state->entries[off].txn = cur_txn;
1396 : :
1397 : 301 : binaryheap_add_unordered(state->heap, Int32GetDatum(off++));
1398 : : }
1399 : : }
1400 : :
1401 : : /* assemble a valid binary heap */
1402 : 2107 : binaryheap_build(state->heap);
1403 : 2107 : }
1404 : :
1405 : : /*
1406 : : * Return the next change when iterating over a transaction and its
1407 : : * subtransactions, and advance the heap entry it came from.
1408 : : *
1409 : : * Returns NULL when no further changes exist.
1410 : : */
1411 : : static ReorderBufferChange *
1412 : 358631 : ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state)
1413 : : {
1414 : : ReorderBufferChange *change;
1415 : : ReorderBufferIterTXNEntry *entry;
1416 : : int32 off;
1417 : :
1418 : : /* nothing there anymore */
169 nathan@postgresql.or 1419 [ + + ]:GNC 358631 : if (binaryheap_empty(state->heap))
4307 rhaas@postgresql.org 1420 :CBC 2097 : return NULL;
1421 : :
1422 : 356534 : off = DatumGetInt32(binaryheap_first(state->heap));
1423 : 356534 : entry = &state->entries[off];
1424 : :
1425 : : /* free memory we might have "leaked" in the previous *Next call */
1426 [ + + ]: 356534 : if (!dlist_is_empty(&state->old_change))
1427 : : {
1428 : 45 : change = dlist_container(ReorderBufferChange, node,
1429 : : dlist_pop_head_node(&state->old_change));
280 heikki.linnakangas@i 1430 : 45 : ReorderBufferFreeChange(rb, change, true);
4307 rhaas@postgresql.org 1431 [ - + ]: 45 : Assert(dlist_is_empty(&state->old_change));
1432 : : }
1433 : :
1434 : 356534 : change = entry->change;
1435 : :
1436 : : /*
1437 : : * update heap with information about which transaction has the next
1438 : : * relevant change in LSN order
1439 : : */
1440 : :
1441 : : /* there are in-memory changes */
1442 [ + + ]: 356534 : if (dlist_has_next(&entry->txn->changes, &entry->change->node))
1443 : : {
1444 : 354272 : dlist_node *next = dlist_next_node(&entry->txn->changes, &change->node);
1445 : 354272 : ReorderBufferChange *next_change =
943 tgl@sss.pgh.pa.us 1446 :ECB (353189) : dlist_container(ReorderBufferChange, node, next);
1447 : :
1448 : : /* txn stays the same */
4307 rhaas@postgresql.org 1449 :CBC 354272 : state->entries[off].lsn = next_change->lsn;
1450 : 354272 : state->entries[off].change = next_change;
1451 : :
1452 : 354272 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1453 : 354272 : return change;
1454 : : }
1455 : :
1456 : : /* try to load changes from disk (the counts differ when not all changes are in memory) */
1457 [ + + ]: 2262 : if (entry->txn->nentries != entry->txn->nentries_mem)
1458 : : {
1459 : : /*
1460 : : * Ugly: restoring changes will reuse *Change records, thus delete the
1461 : : * current one from the per-tx list and only free in the next call.
1462 : : */
1463 : 65 : dlist_delete(&change->node);
1464 : 65 : dlist_push_tail(&state->old_change, &change->node);
1465 : :
1466 : : /*
1467 : : * Update the total bytes processed by the txn for which we are
1468 : : * releasing the current set of changes and restoring the new set of
1469 : : * changes.
1470 : : */
1689 akapila@postgresql.o 1471 : 65 : rb->totalBytes += entry->txn->size;
2195 1472 [ + + ]: 65 : if (ReorderBufferRestoreChanges(rb, entry->txn, &entry->file,
1473 : : &state->entries[off].segno))
1474 : : {
1475 : : /* successfully restored changes from disk */
1476 : : ReorderBufferChange *next_change =
943 tgl@sss.pgh.pa.us 1477 : 36 : dlist_head_element(ReorderBufferChange, node,
1478 : : &entry->txn->changes);
1479 : :
4307 rhaas@postgresql.org 1480 [ - + ]: 36 : elog(DEBUG2, "restored %u/%u changes from disk",
1481 : : (uint32) entry->txn->nentries_mem,
1482 : : (uint32) entry->txn->nentries);
1483 : :
1484 [ - + ]: 36 : Assert(entry->txn->nentries_mem);
1485 : : /* txn stays the same */
1486 : 36 : state->entries[off].lsn = next_change->lsn;
1487 : 36 : state->entries[off].change = next_change;
1488 : 36 : binaryheap_replace_first(state->heap, Int32GetDatum(off));
1489 : :
1490 : 36 : return change;
1491 : : }
1492 : : }
1493 : :
1494 : : /* ok, no changes there anymore, remove this transaction from the heap */
1495 : 2226 : binaryheap_remove_first(state->heap);
1496 : :
1497 : 2226 : return change;
1498 : : }
1499 : :
1500 : : /*
1501 : : * Deallocate the iterator: close any entry file that is still open, free a change left over from the last *Next call, then free the heap and the state itself.
1502 : : */
1503 : : static void
1504 : 2106 : ReorderBufferIterTXNFinish(ReorderBuffer *rb,
1505 : : ReorderBufferIterTXNState *state)
1506 : : {
1507 : : int32 off;
1508 : :
1509 [ + + ]: 4334 : for (off = 0; off < state->nr_txns; off++)
1510 : : {
2195 akapila@postgresql.o 1511 [ - + ]: 2228 : if (state->entries[off].file.vfd != -1)
2195 akapila@postgresql.o 1512 :UBC 0 : FileClose(state->entries[off].file.vfd);
1513 : : }
1514 : :
1515 : : /* free memory we might have "leaked" in the last *Next call */
4307 rhaas@postgresql.org 1516 [ + + ]:CBC 2106 : if (!dlist_is_empty(&state->old_change))
1517 : : {
1518 : : ReorderBufferChange *change;
1519 : :
1520 : 19 : change = dlist_container(ReorderBufferChange, node,
1521 : : dlist_pop_head_node(&state->old_change));
280 heikki.linnakangas@i 1522 : 19 : ReorderBufferFreeChange(rb, change, true);
4307 rhaas@postgresql.org 1523 [ - + ]: 19 : Assert(dlist_is_empty(&state->old_change));
1524 : : }
1525 : :
1526 : 2106 : binaryheap_free(state->heap);
1527 : 2106 : pfree(state);
1528 : 2106 : }
1529 : :
1530 : : /*
1531 : : * Cleanup the contents of a transaction, usually after the transaction
1532 : : * committed or aborted.
1533 : : */
1534 : : static void
1535 : 3862 : ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1536 : : {
1537 : : bool found;
1538 : : dlist_mutable_iter iter;
478 msawada@postgresql.o 1539 : 3862 : Size mem_freed = 0;
1540 : :
1541 : : /* cleanup subtransactions & their changes */
4307 rhaas@postgresql.org 1542 [ + - + + ]: 4047 : dlist_foreach_modify(iter, &txn->subtxns)
1543 : : {
1544 : : ReorderBufferTXN *subtxn;
1545 : :
1546 : 185 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1547 : :
1548 : : /*
1549 : : * Subtransactions are always associated to the toplevel TXN, even if
1550 : : * they originally were happening inside another subtxn, so we won't
1551 : : * ever recurse more than one level deep here.
1552 : : */
2168 alvherre@alvh.no-ip. 1553 [ - + ]: 185 : Assert(rbtxn_is_known_subxact(subtxn));
4307 rhaas@postgresql.org 1554 [ - + ]: 185 : Assert(subtxn->nsubtxns == 0);
1555 : :
1556 : 185 : ReorderBufferCleanupTXN(rb, subtxn);
1557 : : }
1558 : :
1559 : : /* cleanup changes in the txn */
1560 [ + - + + ]: 69788 : dlist_foreach_modify(iter, &txn->changes)
1561 : : {
1562 : : ReorderBufferChange *change;
1563 : :
1564 : 65926 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1565 : :
1566 : : /* Check we're not mixing changes from different transactions. */
2223 akapila@postgresql.o 1567 [ - + ]: 65926 : Assert(change->txn == txn);
1568 : :
1569 : : /*
1570 : : * Instead of updating the memory counter for individual changes, we
1571 : : * sum up the size of memory to free so we can update the memory
1572 : : * counter all together below. This saves costs of maintaining the
1573 : : * max-heap.
1574 : : */
478 msawada@postgresql.o 1575 : 65926 : mem_freed += ReorderBufferChangeSize(change);
1576 : :
280 heikki.linnakangas@i 1577 : 65926 : ReorderBufferFreeChange(rb, change, false);
1578 : : }
1579 : :
1580 : : /* Update the memory counter */
478 msawada@postgresql.o 1581 : 3862 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1582 : :
1583 : : /*
1584 : : * Cleanup the tuplecids we stored for decoding catalog snapshot access.
1585 : : * They are always stored in the toplevel transaction.
1586 : : */
4307 rhaas@postgresql.org 1587 [ + - + + ]: 27809 : dlist_foreach_modify(iter, &txn->tuplecids)
1588 : : {
1589 : : ReorderBufferChange *change;
1590 : :
1591 : 23947 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1592 : :
1593 : : /* Check we're not mixing changes from different transactions. */
2223 akapila@postgresql.o 1594 [ - + ]: 23947 : Assert(change->txn == txn);
4303 tgl@sss.pgh.pa.us 1595 [ - + ]: 23947 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1596 : :
280 heikki.linnakangas@i 1597 : 23947 : ReorderBufferFreeChange(rb, change, true);
1598 : : }
1599 : :
1600 : : /*
1601 : : * Cleanup the base snapshot, if set.
1602 : : */
4307 rhaas@postgresql.org 1603 [ + + ]: 3862 : if (txn->base_snapshot != NULL)
1604 : : {
1605 : 3201 : SnapBuildSnapDecRefcount(txn->base_snapshot);
2731 alvherre@alvh.no-ip. 1606 : 3201 : dlist_delete(&txn->base_snapshot_node);
1607 : : }
1608 : :
1609 : : /*
1610 : : * Cleanup the snapshot for the last streamed run.
1611 : : */
1957 akapila@postgresql.o 1612 [ + + ]: 3862 : if (txn->snapshot_now != NULL)
1613 : : {
1614 [ - + ]: 63 : Assert(rbtxn_is_streamed(txn));
1615 : 63 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
1616 : : }
1617 : :
1618 : : /*
1619 : : * Remove TXN from its containing lists.
1620 : : *
1621 : : * Note: if txn is known as subxact, we are deleting the TXN from its
1622 : : * parent's list of known subxacts; this leaves the parent's nsubxacts
1623 : : * count too high, but we don't care. Otherwise, we are deleting the TXN
1624 : : * from the LSN-ordered list of toplevel TXNs. We remove the TXN from the
1625 : : * list of catalog modifying transactions as well.
1626 : : */
3391 tgl@sss.pgh.pa.us 1627 : 3862 : dlist_delete(&txn->node);
1224 akapila@postgresql.o 1628 [ + + ]: 3862 : if (rbtxn_has_catalog_changes(txn))
1141 drowley@postgresql.o 1629 : 1284 : dclist_delete_from(&rb->catchange_txns, &txn->catchange_node);
1630 : :
1631 : : /* now remove reference from buffer */
943 tgl@sss.pgh.pa.us 1632 : 3862 : hash_search(rb->by_txn, &txn->xid, HASH_REMOVE, &found);
4307 rhaas@postgresql.org 1633 [ - + ]: 3862 : Assert(found);
1634 : :
1635 : : /* remove entries spilled to disk */
2168 alvherre@alvh.no-ip. 1636 [ + + ]: 3862 : if (rbtxn_is_serialized(txn))
4307 rhaas@postgresql.org 1637 : 237 : ReorderBufferRestoreCleanup(rb, txn);
1638 : :
1639 : : /* deallocate */
280 heikki.linnakangas@i 1640 : 3862 : ReorderBufferFreeTXN(rb, txn);
4307 rhaas@postgresql.org 1641 : 3862 : }
1642 : :
1643 : : /*
1644 : : * Discard changes from a transaction (and subtransactions), either after
1645 : : * streaming, decoding them at PREPARE, or detecting the transaction abort.
1646 : : * Keep the remaining info - transactions, tuplecids, invalidations and
1647 : : * snapshots.
1648 : : *
1649 : : * We additionally remove tuplecids after decoding the transaction at prepare
1650 : : * time as we only need to perform invalidation at rollback or commit prepared.
1651 : : *
1652 : : * 'txn_prepared' indicates that we have decoded the transaction at prepare
1653 : : * time.
1654 : : */
1655 : : static void
1808 akapila@postgresql.o 1656 : 1039 : ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, bool txn_prepared)
1657 : : {
1658 : : dlist_mutable_iter iter;
478 msawada@postgresql.o 1659 : 1039 : Size mem_freed = 0;
1660 : :
1661 : : /* cleanup subtransactions & their changes */
1957 akapila@postgresql.o 1662 [ + - + + ]: 1336 : dlist_foreach_modify(iter, &txn->subtxns)
1663 : : {
1664 : : ReorderBufferTXN *subtxn;
1665 : :
1666 : 297 : subtxn = dlist_container(ReorderBufferTXN, node, iter.cur);
1667 : :
1668 : : /*
1669 : : * Subtransactions are always associated to the toplevel TXN, even if
1670 : : * they originally were happening inside another subtxn, so we won't
1671 : : * ever recurse more than one level deep here.
1672 : : */
1673 [ - + ]: 297 : Assert(rbtxn_is_known_subxact(subtxn));
1674 [ - + ]: 297 : Assert(subtxn->nsubtxns == 0);
1675 : :
308 msawada@postgresql.o 1676 : 297 : ReorderBufferMaybeMarkTXNStreamed(rb, subtxn);
1808 akapila@postgresql.o 1677 : 297 : ReorderBufferTruncateTXN(rb, subtxn, txn_prepared);
1678 : : }
1679 : :
1680 : : /* cleanup changes in the txn */
1957 1681 [ + - + + ]: 158734 : dlist_foreach_modify(iter, &txn->changes)
1682 : : {
1683 : : ReorderBufferChange *change;
1684 : :
1685 : 157695 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1686 : :
1687 : : /* Check we're not mixing changes from different transactions. */
1688 [ - + ]: 157695 : Assert(change->txn == txn);
1689 : :
1690 : : /* remove the change from its containing list */
1691 : 157695 : dlist_delete(&change->node);
1692 : :
1693 : : /*
1694 : : * Instead of updating the memory counter for individual changes, we
1695 : : * sum up the size of memory to free so we can update the memory
1696 : : * counter all together below. This saves costs of maintaining the
1697 : : * max-heap.
1698 : : */
478 msawada@postgresql.o 1699 : 157695 : mem_freed += ReorderBufferChangeSize(change);
1700 : :
280 heikki.linnakangas@i 1701 : 157695 : ReorderBufferFreeChange(rb, change, false);
1702 : : }
1703 : :
1704 : : /* Update the memory counter */
478 msawada@postgresql.o 1705 : 1039 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, mem_freed);
1706 : :
1808 akapila@postgresql.o 1707 [ + + ]: 1039 : if (txn_prepared)
1708 : : {
1709 : : /*
1710 : : * If this is a prepared txn, cleanup the tuplecids we stored for
1711 : : * decoding catalog snapshot access. They are always stored in the
1712 : : * toplevel transaction.
1713 : : */
1714 [ + - + + ]: 178 : dlist_foreach_modify(iter, &txn->tuplecids)
1715 : : {
1716 : : ReorderBufferChange *change;
1717 : :
1718 : 123 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1719 : :
1720 : : /* Check we're not mixing changes from different transactions. */
1721 [ - + ]: 123 : Assert(change->txn == txn);
1722 [ - + ]: 123 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1723 : :
1724 : : /* Remove the change from its containing list. */
1725 : 123 : dlist_delete(&change->node);
1726 : :
280 heikki.linnakangas@i 1727 : 123 : ReorderBufferFreeChange(rb, change, true);
1728 : : }
1729 : : }
1730 : :
1731 : : /*
1732 : : * Destroy the (relfilelocator, ctid) hashtable, so that we don't leak any
1733 : : * memory. We could also keep the hash table and update it with new ctid
1734 : : * values, but this seems simpler and good enough for now.
1735 : : */
1957 akapila@postgresql.o 1736 [ + + ]: 1039 : if (txn->tuplecid_hash != NULL)
1737 : : {
1738 : 51 : hash_destroy(txn->tuplecid_hash);
1739 : 51 : txn->tuplecid_hash = NULL;
1740 : : }
1741 : :
1742 : : /* If this txn is serialized then clean the disk space. */
1743 [ + + ]: 1039 : if (rbtxn_is_serialized(txn))
1744 : : {
1745 : 9 : ReorderBufferRestoreCleanup(rb, txn);
1746 : 9 : txn->txn_flags &= ~RBTXN_IS_SERIALIZED;
1747 : :
1748 : : /*
1749 : : * We set this flag to indicate if the transaction is ever serialized.
1750 : : * We need this to accurately update the stats as otherwise the same
1751 : : * transaction can be counted as serialized multiple times.
1752 : : */
1896 1753 : 9 : txn->txn_flags |= RBTXN_IS_SERIALIZED_CLEAR;
1754 : : }
1755 : :
1756 : : /* also reset the number of entries in the transaction */
1957 1757 : 1039 : txn->nentries_mem = 0;
1758 : 1039 : txn->nentries = 0;
1759 : 1039 : }
1760 : :
1761 : : /*
1762 : : * Check the transaction status by CLOG lookup and discard all changes if
1763 : : * the transaction is aborted. The transaction status is cached in
1764 : : * txn->txn_flags so we can skip future changes and avoid CLOG lookups on the
1765 : : * next call.
1766 : : *
1767 : : * Return true if the transaction is aborted, otherwise return false.
1768 : : *
1769 : : * When the 'debug_logical_replication_streaming' is set to "immediate", we
1770 : : * don't check the transaction status, meaning the caller will always process
1771 : : * this transaction.
1772 : : */
1773 : : static bool
308 msawada@postgresql.o 1774 : 4089 : ReorderBufferCheckAndTruncateAbortedTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
1775 : : {
1776 : : /* Quick return for regression tests */
1777 [ + + ]: 4089 : if (unlikely(debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE))
1778 : 942 : return false;
1779 : :
1780 : : /*
1781 : : * Quick return if the transaction status is already known.
1782 : : */
1783 : :
1784 [ + + ]: 3147 : if (rbtxn_is_committed(txn))
1785 : 2693 : return false;
1786 [ - + ]: 454 : if (rbtxn_is_aborted(txn))
1787 : : {
1788 : : /* Already-aborted transactions should not have any changes */
308 msawada@postgresql.o 1789 [ # # ]:UBC 0 : Assert(txn->size == 0);
1790 : :
1791 : 0 : return true;
1792 : : }
1793 : :
1794 : : /* Otherwise, check the transaction status using CLOG lookup */
1795 : :
308 msawada@postgresql.o 1796 [ + + ]:CBC 454 : if (TransactionIdIsInProgress(txn->xid))
1797 : 253 : return false;
1798 : :
1799 [ + + ]: 201 : if (TransactionIdDidCommit(txn->xid))
1800 : : {
1801 : : /*
1802 : : * Remember the transaction is committed so that we can skip CLOG
1803 : : * check next time, avoiding the pressure on CLOG lookup.
1804 : : */
1805 [ - + ]: 192 : Assert(!rbtxn_is_aborted(txn));
1806 : 192 : txn->txn_flags |= RBTXN_IS_COMMITTED;
1807 : 192 : return false;
1808 : : }
1809 : :
1810 : : /*
1811 : : * The transaction aborted. We discard both the changes collected so far
1812 : : * and the toast reconstruction data. The full cleanup will happen as part
1813 : : * of decoding ABORT record of this transaction.
1814 : : */
1815 : 9 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
1816 : 9 : ReorderBufferToastReset(rb, txn);
1817 : :
1818 : : /* All changes should be discarded */
1819 [ - + ]: 9 : Assert(txn->size == 0);
1820 : :
1821 : : /*
1822 : : * Mark the transaction as aborted so we can ignore future changes of this
1823 : : * transaction.
1824 : : */
1825 [ - + ]: 9 : Assert(!rbtxn_is_committed(txn));
1826 : 9 : txn->txn_flags |= RBTXN_IS_ABORTED;
1827 : :
1828 : 9 : return true;
1829 : : }
1830 : :
1831 : : /*
1832 : : * Build a hash with a (relfilelocator, ctid) -> (cmin, cmax) mapping for use by
1833 : : * HeapTupleSatisfiesHistoricMVCC.
1834 : : */
1835 : : static void
4307 rhaas@postgresql.org 1836 : 2107 : ReorderBufferBuildTupleCidHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
1837 : : {
1838 : : dlist_iter iter;
1839 : : HASHCTL hash_ctl;
1840 : :
2168 alvherre@alvh.no-ip. 1841 [ + + + + ]: 2107 : if (!rbtxn_has_catalog_changes(txn) || dlist_is_empty(&txn->tuplecids))
4307 rhaas@postgresql.org 1842 : 1402 : return;
1843 : :
1844 : 705 : hash_ctl.keysize = sizeof(ReorderBufferTupleCidKey);
1845 : 705 : hash_ctl.entrysize = sizeof(ReorderBufferTupleCidEnt);
1846 : 705 : hash_ctl.hcxt = rb->context;
1847 : :
1848 : : /*
1849 : : * create the hash with the exact number of to-be-stored tuplecids from
1850 : : * the start
1851 : : */
1852 : 705 : txn->tuplecid_hash =
1853 : 705 : hash_create("ReorderBufferTupleCid", txn->ntuplecids, &hash_ctl,
1854 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
1855 : :
1856 [ + - + + ]: 12760 : dlist_foreach(iter, &txn->tuplecids)
1857 : : {
1858 : : ReorderBufferTupleCidKey key;
1859 : : ReorderBufferTupleCidEnt *ent;
1860 : : bool found;
1861 : : ReorderBufferChange *change;
1862 : :
1863 : 12055 : change = dlist_container(ReorderBufferChange, node, iter.cur);
1864 : :
4303 tgl@sss.pgh.pa.us 1865 [ - + ]: 12055 : Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID);
1866 : :
1867 : : /* be careful about padding */
4307 rhaas@postgresql.org 1868 : 12055 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
1869 : :
1260 1870 : 12055 : key.rlocator = change->data.tuplecid.locator;
1871 : :
4303 tgl@sss.pgh.pa.us 1872 : 12055 : ItemPointerCopy(&change->data.tuplecid.tid,
1873 : : &key.tid);
1874 : :
1875 : : ent = (ReorderBufferTupleCidEnt *)
1045 peter@eisentraut.org 1876 : 12055 : hash_search(txn->tuplecid_hash, &key, HASH_ENTER, &found);
4307 rhaas@postgresql.org 1877 [ + + ]: 12055 : if (!found)
1878 : : {
4303 tgl@sss.pgh.pa.us 1879 : 10418 : ent->cmin = change->data.tuplecid.cmin;
1880 : 10418 : ent->cmax = change->data.tuplecid.cmax;
1881 : 10418 : ent->combocid = change->data.tuplecid.combocid;
1882 : : }
1883 : : else
1884 : : {
1885 : : /*
1886 : : * Maybe we already saw this tuple before in this transaction, but
1887 : : * if so it must have the same cmin.
1888 : : */
1889 [ - + ]: 1637 : Assert(ent->cmin == change->data.tuplecid.cmin);
1890 : :
1891 : : /*
1892 : : * cmax may be initially invalid, but once set it can only grow,
1893 : : * and never become invalid again.
1894 : : */
2500 alvherre@alvh.no-ip. 1895 [ + + + - : 1637 : Assert((ent->cmax == InvalidCommandId) ||
- + ]
1896 : : ((change->data.tuplecid.cmax != InvalidCommandId) &&
1897 : : (change->data.tuplecid.cmax > ent->cmax)));
4303 tgl@sss.pgh.pa.us 1898 : 1637 : ent->cmax = change->data.tuplecid.cmax;
1899 : : }
1900 : : }
1901 : : }
1902 : :
1903 : : /*
1904 : : * Copy a provided snapshot so we can modify it privately. This is needed so
1905 : : * that catalog modifying transactions can look into intermediate catalog
1906 : : * states.
1907 : : */
1908 : : static Snapshot
4307 rhaas@postgresql.org 1909 : 2009 : ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap,
1910 : : ReorderBufferTXN *txn, CommandId cid)
1911 : : {
1912 : : Snapshot snap;
1913 : : dlist_iter iter;
1914 : 2009 : int i = 0;
1915 : : Size size;
1916 : :
1917 : 2009 : size = sizeof(SnapshotData) +
1918 : 2009 : sizeof(TransactionId) * orig_snap->xcnt +
1919 : 2009 : sizeof(TransactionId) * (txn->nsubtxns + 1);
1920 : :
1921 : 2009 : snap = MemoryContextAllocZero(rb->context, size);
1922 : 2009 : memcpy(snap, orig_snap, sizeof(SnapshotData));
1923 : :
1924 : 2009 : snap->copied = true;
3898 heikki.linnakangas@i 1925 : 2009 : snap->active_count = 1; /* mark as active so nobody frees it */
1926 : 2009 : snap->regd_count = 0;
4307 rhaas@postgresql.org 1927 : 2009 : snap->xip = (TransactionId *) (snap + 1);
1928 : :
1929 : 2009 : memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt);
1930 : :
1931 : : /*
1932 : : * snap->subxip contains all txids that belong to our transaction which we
1933 : : * need to check via cmin/cmax. That's why we store the toplevel
1934 : : * transaction in there as well.
1935 : : */
1936 : 2009 : snap->subxip = snap->xip + snap->xcnt;
1937 : 2009 : snap->subxip[i++] = txn->xid;
1938 : :
1939 : : /*
1940 : : * txn->nsubtxns isn't decreased when subtransactions abort, so count
1941 : : * manually. Since it's an upper boundary it is safe to use it for the
1942 : : * allocation above.
1943 : : */
1944 : 2009 : snap->subxcnt = 1;
1945 : :
1946 [ + - + + ]: 2318 : dlist_foreach(iter, &txn->subtxns)
1947 : : {
1948 : : ReorderBufferTXN *sub_txn;
1949 : :
1950 : 309 : sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur);
1951 : 309 : snap->subxip[i++] = sub_txn->xid;
1952 : 309 : snap->subxcnt++;
1953 : : }
1954 : :
1955 : : /* sort so we can bsearch() later */
1956 : 2009 : qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator);
1957 : :
1958 : : /* store the specified current CommandId */
1959 : 2009 : snap->curcid = cid;
1960 : :
1961 : 2009 : return snap;
1962 : : }
1963 : :
1964 : : /*
1965 : : * Free a previously ReorderBufferCopySnap'ed snapshot
1966 : : */
1967 : : static void
1968 : 3258 : ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap)
1969 : : {
1970 [ + + ]: 3258 : if (snap->copied)
1971 : 2006 : pfree(snap);
1972 : : else
1973 : 1252 : SnapBuildSnapDecRefcount(snap);
1974 : 3258 : }
1975 : :
1976 : : /*
1977 : : * If the transaction was (partially) streamed, we need to prepare or commit
1978 : : * it in a 'streamed' way. That is, we first stream the remaining part of the
1979 : : * transaction, and then invoke stream_prepare or stream_commit message as per
1980 : : * the case.
1981 : : */
1982 : : static void
1957 akapila@postgresql.o 1983 : 62 : ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn)
1984 : : {
1985 : : /* we should only call this for previously streamed transactions */
1986 [ - + ]: 62 : Assert(rbtxn_is_streamed(txn));
1987 : :
1988 : 62 : ReorderBufferStreamTXN(rb, txn);
1989 : :
308 msawada@postgresql.o 1990 [ + + ]: 62 : if (rbtxn_is_prepared(txn))
1991 : : {
1992 : : /*
1993 : : * Note, we send stream prepare even if a concurrent abort is
1994 : : * detected. See DecodePrepare for more information.
1995 : : */
1996 [ - + ]: 12 : Assert(!rbtxn_sent_prepare(txn));
1808 akapila@postgresql.o 1997 : 12 : rb->stream_prepare(rb, txn, txn->final_lsn);
308 msawada@postgresql.o 1998 : 12 : txn->txn_flags |= RBTXN_SENT_PREPARE;
1999 : :
2000 : : /*
2001 : : * This is a PREPARED transaction, part of a two-phase commit. The
2002 : : * full cleanup will happen as part of the COMMIT PREPAREDs, so now
2003 : : * just truncate txn by removing changes and tuplecids.
2004 : : */
1808 akapila@postgresql.o 2005 : 12 : ReorderBufferTruncateTXN(rb, txn, true);
2006 : : /* Reset the CheckXidAlive */
2007 : 12 : CheckXidAlive = InvalidTransactionId;
2008 : : }
2009 : : else
2010 : : {
2011 : 50 : rb->stream_commit(rb, txn, txn->final_lsn);
2012 : 50 : ReorderBufferCleanupTXN(rb, txn);
2013 : : }
1957 2014 : 62 : }
2015 : :
2016 : : /*
2017 : : * Set xid to detect concurrent aborts.
2018 : : *
2019 : : * While streaming an in-progress transaction or decoding a prepared
2020 : : * transaction there is a possibility that the (sub)transaction might get
2021 : : * aborted concurrently. In such case if the (sub)transaction has catalog
2022 : : * update then we might decode the tuple using wrong catalog version. For
2023 : : * example, suppose there is one catalog tuple with (xmin: 500, xmax: 0). Now,
2024 : : * the transaction 501 updates the catalog tuple and after that we will have
2025 : : * two tuples (xmin: 500, xmax: 501) and (xmin: 501, xmax: 0). Now, if 501 is
2026 : : * aborted and some other transaction say 502 updates the same catalog tuple
2027 : : * then the first tuple will be changed to (xmin: 500, xmax: 502). So, the
2028 : : * problem is that when we try to decode the tuple inserted/updated in 501
2029 : : * after the catalog update, we will see the catalog tuple with (xmin: 500,
2030 : : * xmax: 502) as visible because it will consider that the tuple is deleted by
2031 : : * xid 502 which is not visible to our snapshot. And when we will try to
2032 : : * decode with that catalog tuple, it can lead to a wrong result or a crash.
2033 : : * So, it is necessary to detect concurrent aborts to allow streaming of
2034 : : * in-progress transactions or decoding of prepared transactions.
2035 : : *
2036 : : * For detecting the concurrent abort we set CheckXidAlive to the current
2037 : : * (sub)transaction's xid for which this change belongs to. And, during
2038 : : * catalog scan we can check the status of the xid and if it is aborted we will
2039 : : * report a specific error so that we can stop streaming current transaction
2040 : : * and discard the already streamed changes on such an error. We might have
2041 : : * already streamed some of the changes for the aborted (sub)transaction, but
2042 : : * that is fine because when we decode the abort we will stream abort message
2043 : : * to truncate the changes in the subscriber. Similarly, for prepared
2044 : : * transactions, we stop decoding if concurrent abort is detected and then
2045 : : * rollback the changes when rollback prepared is encountered. See
2046 : : * DecodePrepare.
2047 : : */
2048 : : static inline void
2049 : 177852 : SetupCheckXidLive(TransactionId xid)
2050 : : {
2051 : : /*
2052 : : * If the input transaction id is already set as a CheckXidAlive then
2053 : : * nothing to do.
2054 : : */
2055 [ + + ]: 177852 : if (TransactionIdEquals(CheckXidAlive, xid))
4307 rhaas@postgresql.org 2056 : 100843 : return;
2057 : :
2058 : : /*
2059 : : * setup CheckXidAlive if it's not committed yet. We don't check if the
2060 : : * xid is aborted. That will happen during catalog access.
2061 : : */
1957 akapila@postgresql.o 2062 [ + + ]: 77009 : if (!TransactionIdDidCommit(xid))
2063 : 386 : CheckXidAlive = xid;
2064 : : else
2065 : 76623 : CheckXidAlive = InvalidTransactionId;
2066 : : }
2067 : :
2068 : : /*
2069 : : * Helper function for ReorderBufferProcessTXN for applying change.
2070 : : */
2071 : : static inline void
2072 : 334053 : ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
2073 : : Relation relation, ReorderBufferChange *change,
2074 : : bool streaming)
2075 : : {
2076 [ + + ]: 334053 : if (streaming)
2077 : 175986 : rb->stream_change(rb, txn, relation, change);
2078 : : else
2079 : 158067 : rb->apply_change(rb, txn, relation, change);
2080 : 334051 : }
2081 : :
2082 : : /*
2083 : : * Helper function for ReorderBufferProcessTXN for applying the truncate.
2084 : : */
2085 : : static inline void
2086 : 23 : ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn,
2087 : : int nrelations, Relation *relations,
2088 : : ReorderBufferChange *change, bool streaming)
2089 : : {
2090 [ - + ]: 23 : if (streaming)
1957 akapila@postgresql.o 2091 :UBC 0 : rb->stream_truncate(rb, txn, nrelations, relations, change);
2092 : : else
1957 akapila@postgresql.o 2093 :CBC 23 : rb->apply_truncate(rb, txn, nrelations, relations, change);
2094 : 23 : }
2095 : :
2096 : : /*
2097 : : * Helper function for ReorderBufferProcessTXN for applying the message.
2098 : : */
2099 : : static inline void
2100 : 11 : ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn,
2101 : : ReorderBufferChange *change, bool streaming)
2102 : : {
2103 [ + + ]: 11 : if (streaming)
2104 : 3 : rb->stream_message(rb, txn, change->lsn, true,
2105 : 3 : change->data.msg.prefix,
2106 : : change->data.msg.message_size,
2107 : 3 : change->data.msg.message);
2108 : : else
2109 : 8 : rb->message(rb, txn, change->lsn, true,
2110 : 8 : change->data.msg.prefix,
2111 : : change->data.msg.message_size,
2112 : 8 : change->data.msg.message);
2113 : 11 : }
2114 : :
2115 : : /*
2116 : : * Function to store the command id and snapshot at the end of the current
2117 : : * stream so that we can reuse the same while sending the next stream.
2118 : : */
2119 : : static inline void
2120 : 692 : ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn,
2121 : : Snapshot snapshot_now, CommandId command_id)
2122 : : {
2123 : 692 : txn->command_id = command_id;
2124 : :
2125 : : /* Avoid copying if it's already copied. */
2126 [ + - ]: 692 : if (snapshot_now->copied)
2127 : 692 : txn->snapshot_now = snapshot_now;
2128 : : else
1957 akapila@postgresql.o 2129 :UBC 0 : txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2130 : : txn, command_id);
1957 akapila@postgresql.o 2131 :CBC 692 : }
2132 : :
2133 : : /*
2134 : : * Mark the given transaction as streamed if it's a top-level transaction
2135 : : * or has changes.
2136 : : */
2137 : : static void
308 msawada@postgresql.o 2138 : 989 : ReorderBufferMaybeMarkTXNStreamed(ReorderBuffer *rb, ReorderBufferTXN *txn)
2139 : : {
2140 : : /*
2141 : : * The top-level transaction, is marked as streamed always, even if it
2142 : : * does not contain any changes (that is, when all the changes are in
2143 : : * subtransactions).
2144 : : *
2145 : : * For subtransactions, we only mark them as streamed when there are
2146 : : * changes in them.
2147 : : *
2148 : : * We do it this way because of aborts - we don't want to send aborts for
2149 : : * XIDs the downstream is not aware of. And of course, it always knows
2150 : : * about the top-level xact (we send the XID in all messages), but we
2151 : : * never stream XIDs of empty subxacts.
2152 : : */
2153 [ + + + + ]: 989 : if (rbtxn_is_toptxn(txn) || (txn->nentries_mem != 0))
2154 : 827 : txn->txn_flags |= RBTXN_IS_STREAMED;
2155 : 989 : }
2156 : :
2157 : : /*
2158 : : * Helper function for ReorderBufferProcessTXN to handle the concurrent
2159 : : * abort of the streaming transaction. This resets the TXN such that it
2160 : : * can be used to stream the remaining data of transaction being processed.
2161 : : * This can happen when the subtransaction is aborted and we still want to
2162 : : * continue processing the main or other subtransactions data.
2163 : : */
2164 : : static void
1957 akapila@postgresql.o 2165 : 8 : ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2166 : : Snapshot snapshot_now,
2167 : : CommandId command_id,
2168 : : XLogRecPtr last_lsn,
2169 : : ReorderBufferChange *specinsert)
2170 : : {
2171 : : /* Discard the changes that we just streamed */
308 msawada@postgresql.o 2172 : 8 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2173 : :
2174 : : /* Free all resources allocated for toast reconstruction */
1957 akapila@postgresql.o 2175 : 8 : ReorderBufferToastReset(rb, txn);
2176 : :
2177 : : /* Return the spec insert change if it is not NULL */
2178 [ - + ]: 8 : if (specinsert != NULL)
2179 : : {
280 heikki.linnakangas@i 2180 :UBC 0 : ReorderBufferFreeChange(rb, specinsert, true);
1957 akapila@postgresql.o 2181 : 0 : specinsert = NULL;
2182 : : }
2183 : :
2184 : : /*
2185 : : * For the streaming case, stop the stream and remember the command ID and
2186 : : * snapshot for the streaming run.
2187 : : */
1808 akapila@postgresql.o 2188 [ + - ]:CBC 8 : if (rbtxn_is_streamed(txn))
2189 : : {
2190 : 8 : rb->stream_stop(rb, txn, last_lsn);
2191 : 8 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2192 : : }
2193 : :
2194 : : /* All changes must be deallocated */
478 msawada@postgresql.o 2195 [ - + ]: 8 : Assert(txn->size == 0);
1957 akapila@postgresql.o 2196 : 8 : }
2197 : :
2198 : : /*
2199 : : * Helper function for ReorderBufferReplay and ReorderBufferStreamTXN.
2200 : : *
2201 : : * Send data of a transaction (and its subtransactions) to the
2202 : : * output plugin. We iterate over the top and subtransactions (using a k-way
2203 : : * merge) and replay the changes in lsn order.
2204 : : *
2205 : : * If streaming is true then data will be sent using stream API.
2206 : : *
2207 : : * Note: "volatile" markers on some parameters are to avoid trouble with
2208 : : * PG_TRY inside the function.
2209 : : */
2210 : : static void
2211 : 2107 : ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn,
2212 : : XLogRecPtr commit_lsn,
2213 : : volatile Snapshot snapshot_now,
2214 : : volatile CommandId command_id,
2215 : : bool streaming)
2216 : : {
2217 : : bool using_subtxn;
2218 : 2107 : MemoryContext ccxt = CurrentMemoryContext;
96 alvherre@kurilemu.de 2219 :GNC 2107 : ResourceOwner cowner = CurrentResourceOwner;
1957 akapila@postgresql.o 2220 :CBC 2107 : ReorderBufferIterTXNState *volatile iterstate = NULL;
2221 : 2107 : volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr;
2222 : 2107 : ReorderBufferChange *volatile specinsert = NULL;
2223 : 2107 : volatile bool stream_started = false;
2224 : 2107 : ReorderBufferTXN *volatile curtxn = NULL;
2225 : :
2226 : : /* build data to be able to lookup the CommandIds of catalog tuples */
4307 rhaas@postgresql.org 2227 : 2107 : ReorderBufferBuildTupleCidHash(rb, txn);
2228 : :
2229 : : /* setup the initial snapshot */
2230 : 2107 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2231 : :
2232 : : /*
2233 : : * Decoding needs access to syscaches et al., which in turn use
2234 : : * heavyweight locks and such. Thus we need to have enough state around to
2235 : : * keep track of those. The easiest way is to simply use a transaction
2236 : : * internally. That also allows us to easily enforce that nothing writes
2237 : : * to the database by checking for xid assignments.
2238 : : *
2239 : : * When we're called via the SQL SRF there's already a transaction
2240 : : * started, so start an explicit subtransaction there.
2241 : : */
3979 tgl@sss.pgh.pa.us 2242 : 2107 : using_subtxn = IsTransactionOrTransactionBlock();
2243 : :
4307 rhaas@postgresql.org 2244 [ + + ]: 2107 : PG_TRY();
2245 : : {
2246 : : ReorderBufferChange *change;
1043 akapila@postgresql.o 2247 : 2107 : int changes_count = 0; /* used to accumulate the number of
2248 : : * changes */
2249 : :
4052 andres@anarazel.de 2250 [ + + ]: 2107 : if (using_subtxn)
1957 akapila@postgresql.o 2251 [ + + ]: 485 : BeginInternalSubTransaction(streaming ? "stream" : "replay");
2252 : : else
4307 rhaas@postgresql.org 2253 : 1622 : StartTransactionCommand();
2254 : :
2255 : : /*
2256 : : * We only need to send begin/begin-prepare for non-streamed
2257 : : * transactions.
2258 : : */
1957 akapila@postgresql.o 2259 [ + + ]: 2107 : if (!streaming)
2260 : : {
308 msawada@postgresql.o 2261 [ + + ]: 1415 : if (rbtxn_is_prepared(txn))
1808 akapila@postgresql.o 2262 : 29 : rb->begin_prepare(rb, txn);
2263 : : else
2264 : 1386 : rb->begin(rb, txn);
2265 : : }
2266 : :
2195 2267 : 2107 : ReorderBufferIterTXNInit(rb, txn, &iterstate);
3979 tgl@sss.pgh.pa.us 2268 [ + + ]: 360738 : while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL)
2269 : : {
4307 rhaas@postgresql.org 2270 : 356534 : Relation relation = NULL;
2271 : : Oid reloid;
2272 : :
1212 akapila@postgresql.o 2273 [ - + ]: 356534 : CHECK_FOR_INTERRUPTS();
2274 : :
2275 : : /*
2276 : : * We can't call start stream callback before processing first
2277 : : * change.
2278 : : */
41 alvherre@kurilemu.de 2279 [ + + ]:GNC 356534 : if (!XLogRecPtrIsValid(prev_lsn))
2280 : : {
1957 akapila@postgresql.o 2281 [ + + ]:CBC 2071 : if (streaming)
2282 : : {
2283 : 657 : txn->origin_id = change->origin_id;
2284 : 657 : rb->stream_start(rb, txn, change->lsn);
2285 : 657 : stream_started = true;
2286 : : }
2287 : : }
2288 : :
2289 : : /*
2290 : : * Enforce correct ordering of changes, merged from multiple
2291 : : * subtransactions. The changes may have the same LSN due to
2292 : : * MULTI_INSERT xlog records.
2293 : : */
41 alvherre@kurilemu.de 2294 [ + + - + ]:GNC 356534 : Assert(!XLogRecPtrIsValid(prev_lsn) || prev_lsn <= change->lsn);
2295 : :
1957 akapila@postgresql.o 2296 :CBC 356534 : prev_lsn = change->lsn;
2297 : :
2298 : : /*
2299 : : * Set the current xid to detect concurrent aborts. This is
2300 : : * required for the cases when we decode the changes before the
2301 : : * COMMIT record is processed.
2302 : : */
308 msawada@postgresql.o 2303 [ + + + + ]: 356534 : if (streaming || rbtxn_is_prepared(change->txn))
2304 : : {
1957 akapila@postgresql.o 2305 : 177852 : curtxn = change->txn;
2306 : 177852 : SetupCheckXidLive(curtxn->xid);
2307 : : }
2308 : :
4303 tgl@sss.pgh.pa.us 2309 [ + + + - : 356534 : switch (change->action)
+ + + + +
- - ]
2310 : : {
3876 andres@anarazel.de 2311 : 1782 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
2312 : :
2313 : : /*
2314 : : * Confirmation for speculative insertion arrived. Simply
2315 : : * use as a normal record. It'll be cleaned up at the end
2316 : : * of INSERT processing.
2317 : : */
2722 alvherre@alvh.no-ip. 2318 [ - + ]: 1782 : if (specinsert == NULL)
2722 alvherre@alvh.no-ip. 2319 [ # # ]:UBC 0 : elog(ERROR, "invalid ordering of speculative insertion changes");
3876 andres@anarazel.de 2320 [ - + ]:CBC 1782 : Assert(specinsert->data.tp.oldtuple == NULL);
2321 : 1782 : change = specinsert;
2322 : 1782 : change->action = REORDER_BUFFER_CHANGE_INSERT;
2323 : :
2324 : : /* intentionally fall through */
4303 tgl@sss.pgh.pa.us 2325 : 340466 : case REORDER_BUFFER_CHANGE_INSERT:
2326 : : case REORDER_BUFFER_CHANGE_UPDATE:
2327 : : case REORDER_BUFFER_CHANGE_DELETE:
4307 rhaas@postgresql.org 2328 [ - + ]: 340466 : Assert(snapshot_now);
2329 : :
1260 2330 : 340466 : reloid = RelidByRelfilenumber(change->data.tp.rlocator.spcOid,
2331 : : change->data.tp.rlocator.relNumber);
2332 : :
2333 : : /*
2334 : : * Mapped catalog tuple without data, emitted while
2335 : : * catalog table was in the process of being rewritten. We
2336 : : * can fail to look up the relfilenumber, because the
2337 : : * relmapper has no "historic" view, in contrast to the
2338 : : * normal catalog during decoding. Thus repeated rewrites
2339 : : * can cause a lookup failure. That's OK because we do not
2340 : : * decode catalog changes anyway. Normally such tuples
2341 : : * would be skipped over below, but we can't identify
2342 : : * whether the table should be logically logged without
2343 : : * mapping the relfilenumber to the oid.
2344 : : */
4307 2345 [ + + ]: 340458 : if (reloid == InvalidOid &&
4303 tgl@sss.pgh.pa.us 2346 [ + - ]: 83 : change->data.tp.newtuple == NULL &&
2347 [ + - ]: 83 : change->data.tp.oldtuple == NULL)
3876 andres@anarazel.de 2348 : 83 : goto change_done;
4307 rhaas@postgresql.org 2349 [ - + ]: 340375 : else if (reloid == InvalidOid)
1260 rhaas@postgresql.org 2350 [ # # ]:UBC 0 : elog(ERROR, "could not map filenumber \"%s\" to relation OID",
2351 : : relpathperm(change->data.tp.rlocator,
2352 : : MAIN_FORKNUM).str);
2353 : :
4307 rhaas@postgresql.org 2354 :CBC 340375 : relation = RelationIdGetRelation(reloid);
2355 : :
2292 tgl@sss.pgh.pa.us 2356 [ - + ]: 340375 : if (!RelationIsValid(relation))
1260 rhaas@postgresql.org 2357 [ # # ]:UBC 0 : elog(ERROR, "could not open relation with OID %u (for filenumber \"%s\")",
2358 : : reloid,
2359 : : relpathperm(change->data.tp.rlocator,
2360 : : MAIN_FORKNUM).str);
2361 : :
3876 andres@anarazel.de 2362 [ + - + - :CBC 340375 : if (!RelationIsLogicallyLogged(relation))
- + - - -
- + - +
+ ]
2363 : 4235 : goto change_done;
2364 : :
2365 : : /*
2366 : : * Ignore temporary heaps created during DDL unless the
2367 : : * plugin has asked for them.
2368 : : */
2828 peter_e@gmx.net 2369 [ + + + + ]: 336140 : if (relation->rd_rel->relrewrite && !rb->output_rewrites)
2370 : 26 : goto change_done;
2371 : :
2372 : : /*
2373 : : * For now ignore sequence changes entirely. Most of the
2374 : : * time they don't log changes using records we
2375 : : * understand, so it doesn't make sense to handle the few
2376 : : * cases we do.
2377 : : */
3876 andres@anarazel.de 2378 [ - + ]: 336114 : if (relation->rd_rel->relkind == RELKIND_SEQUENCE)
3876 andres@anarazel.de 2379 :UBC 0 : goto change_done;
2380 : :
2381 : : /* user-triggered change */
3876 andres@anarazel.de 2382 [ + + ]:CBC 336114 : if (!IsToastRelation(relation))
2383 : : {
2384 : 334053 : ReorderBufferToastReplace(rb, txn, relation, change);
1957 akapila@postgresql.o 2385 : 334053 : ReorderBufferApplyChange(rb, txn, relation, change,
2386 : : streaming);
2387 : :
2388 : : /*
2389 : : * Only clear reassembled toast chunks if we're sure
2390 : : * they're not required anymore. The creator of the
2391 : : * tuple tells us.
2392 : : */
3876 andres@anarazel.de 2393 [ + + ]: 334051 : if (change->data.tp.clear_toast_afterwards)
2394 : 333830 : ReorderBufferToastReset(rb, txn);
2395 : : }
2396 : : /* we're not interested in toast deletions */
2397 [ + + ]: 2061 : else if (change->action == REORDER_BUFFER_CHANGE_INSERT)
2398 : : {
2399 : : /*
2400 : : * Need to reassemble the full toasted Datum in
2401 : : * memory, to ensure the chunks don't get reused till
2402 : : * we're done remove it from the list of this
2403 : : * transaction's changes. Otherwise it will get
2404 : : * freed/reused while restoring spooled data from
2405 : : * disk.
2406 : : */
2576 tomas.vondra@postgre 2407 [ - + ]: 1830 : Assert(change->data.tp.newtuple != NULL);
2408 : :
2409 : 1830 : dlist_delete(&change->node);
2410 : 1830 : ReorderBufferToastAppendChunk(rb, txn, relation,
2411 : : change);
2412 : : }
2413 : :
3861 bruce@momjian.us 2414 : 231 : change_done:
2415 : :
2416 : : /*
2417 : : * If speculative insertion was confirmed, the record
2418 : : * isn't needed anymore.
2419 : : */
3876 andres@anarazel.de 2420 [ + + ]: 340456 : if (specinsert != NULL)
2421 : : {
280 heikki.linnakangas@i 2422 : 1782 : ReorderBufferFreeChange(rb, specinsert, true);
3876 andres@anarazel.de 2423 : 1782 : specinsert = NULL;
2424 : : }
2425 : :
1957 akapila@postgresql.o 2426 [ + + ]: 340456 : if (RelationIsValid(relation))
2427 : : {
3876 andres@anarazel.de 2428 : 340373 : RelationClose(relation);
2429 : 340373 : relation = NULL;
2430 : : }
4307 rhaas@postgresql.org 2431 : 340456 : break;
2432 : :
3876 andres@anarazel.de 2433 : 1782 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
2434 : :
2435 : : /*
2436 : : * Speculative insertions are dealt with by delaying the
2437 : : * processing of the insert until the confirmation record
2438 : : * arrives. For that we simply unlink the record from the
2439 : : * chain, so it does not get freed/reused while restoring
2440 : : * spooled data from disk.
2441 : : *
2442 : : * This is safe in the face of concurrent catalog changes
2443 : : * because the relevant relation can't be changed between
2444 : : * speculative insertion and confirmation due to
2445 : : * CheckTableNotInUse() and locking.
2446 : : */
2447 : :
2448 : : /* clear out a pending (and thus failed) speculation */
2449 [ - + ]: 1782 : if (specinsert != NULL)
2450 : : {
280 heikki.linnakangas@i 2451 :UBC 0 : ReorderBufferFreeChange(rb, specinsert, true);
3876 andres@anarazel.de 2452 : 0 : specinsert = NULL;
2453 : : }
2454 : :
2455 : : /* and memorize the pending insertion */
3876 andres@anarazel.de 2456 :CBC 1782 : dlist_delete(&change->node);
2457 : 1782 : specinsert = change;
2458 : 1782 : break;
2459 : :
1646 akapila@postgresql.o 2460 :UBC 0 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
2461 : :
2462 : : /*
2463 : : * Abort for speculative insertion arrived. So cleanup the
2464 : : * specinsert tuple and toast hash.
2465 : : *
2466 : : * Note that we get the spec abort change for each toast
2467 : : * entry but we need to perform the cleanup only the first
2468 : : * time we get it for the main table.
2469 : : */
2470 [ # # ]: 0 : if (specinsert != NULL)
2471 : : {
2472 : : /*
2473 : : * We must clean the toast hash before processing a
2474 : : * completely new tuple to avoid confusion about the
2475 : : * previous tuple's toast chunks.
2476 : : */
2477 [ # # ]: 0 : Assert(change->data.tp.clear_toast_afterwards);
2478 : 0 : ReorderBufferToastReset(rb, txn);
2479 : :
2480 : : /* We don't need this record anymore. */
280 heikki.linnakangas@i 2481 : 0 : ReorderBufferFreeChange(rb, specinsert, true);
1646 akapila@postgresql.o 2482 : 0 : specinsert = NULL;
2483 : : }
2484 : 0 : break;
2485 : :
2811 peter_e@gmx.net 2486 :CBC 23 : case REORDER_BUFFER_CHANGE_TRUNCATE:
2487 : : {
2488 : : int i;
2792 tgl@sss.pgh.pa.us 2489 : 23 : int nrelids = change->data.truncate.nrelids;
2490 : 23 : int nrelations = 0;
2491 : : Relation *relations;
2492 : :
2493 : 23 : relations = palloc0(nrelids * sizeof(Relation));
2494 [ + + ]: 66 : for (i = 0; i < nrelids; i++)
2495 : : {
2496 : 43 : Oid relid = change->data.truncate.relids[i];
2497 : : Relation rel;
2498 : :
1169 drowley@postgresql.o 2499 : 43 : rel = RelationIdGetRelation(relid);
2500 : :
2501 [ - + ]: 43 : if (!RelationIsValid(rel))
2792 tgl@sss.pgh.pa.us 2502 [ # # ]:UBC 0 : elog(ERROR, "could not open relation with OID %u", relid);
2503 : :
1169 drowley@postgresql.o 2504 [ + - + - :CBC 43 : if (!RelationIsLogicallyLogged(rel))
- + - - -
- + - -
+ ]
2792 tgl@sss.pgh.pa.us 2505 :UBC 0 : continue;
2506 : :
1169 drowley@postgresql.o 2507 :CBC 43 : relations[nrelations++] = rel;
2508 : : }
2509 : :
2510 : : /* Apply the truncate. */
1957 akapila@postgresql.o 2511 : 23 : ReorderBufferApplyTruncate(rb, txn, nrelations,
2512 : : relations, change,
2513 : : streaming);
2514 : :
2792 tgl@sss.pgh.pa.us 2515 [ + + ]: 66 : for (i = 0; i < nrelations; i++)
2516 : 43 : RelationClose(relations[i]);
2517 : :
2518 : 23 : break;
2519 : : }
2520 : :
3542 simon@2ndQuadrant.co 2521 : 11 : case REORDER_BUFFER_CHANGE_MESSAGE:
1957 akapila@postgresql.o 2522 : 11 : ReorderBufferApplyMessage(rb, txn, change, streaming);
3542 simon@2ndQuadrant.co 2523 : 11 : break;
2524 : :
1889 akapila@postgresql.o 2525 : 2416 : case REORDER_BUFFER_CHANGE_INVALIDATION:
2526 : : /* Execute the invalidation messages locally */
1314 alvherre@alvh.no-ip. 2527 : 2416 : ReorderBufferExecuteInvalidations(change->data.inval.ninvalidations,
2528 : : change->data.inval.invalidations);
1889 akapila@postgresql.o 2529 : 2416 : break;
2530 : :
4307 rhaas@postgresql.org 2531 : 686 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
2532 : : /* get rid of the old */
2533 : 686 : TeardownHistoricSnapshot(false);
2534 : :
2535 [ + + ]: 686 : if (snapshot_now->copied)
2536 : : {
2537 : 661 : ReorderBufferFreeSnap(rb, snapshot_now);
2538 : 661 : snapshot_now =
4303 tgl@sss.pgh.pa.us 2539 : 661 : ReorderBufferCopySnap(rb, change->data.snapshot,
2540 : : txn, command_id);
2541 : : }
2542 : :
2543 : : /*
2544 : : * Restored from disk, need to be careful not to double
2545 : : * free. We could introduce refcounting for that, but for
2546 : : * now this seems infrequent enough not to care.
2547 : : */
2548 [ - + ]: 25 : else if (change->data.snapshot->copied)
2549 : : {
4307 rhaas@postgresql.org 2550 :UBC 0 : snapshot_now =
4303 tgl@sss.pgh.pa.us 2551 : 0 : ReorderBufferCopySnap(rb, change->data.snapshot,
2552 : : txn, command_id);
2553 : : }
2554 : : else
2555 : : {
4303 tgl@sss.pgh.pa.us 2556 :CBC 25 : snapshot_now = change->data.snapshot;
2557 : : }
2558 : :
2559 : : /* and continue with the new one */
4307 rhaas@postgresql.org 2560 : 686 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2561 : 686 : break;
2562 : :
2563 : 11150 : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4303 tgl@sss.pgh.pa.us 2564 [ - + ]: 11150 : Assert(change->data.command_id != InvalidCommandId);
2565 : :
2566 [ + + ]: 11150 : if (command_id < change->data.command_id)
2567 : : {
2568 : 2087 : command_id = change->data.command_id;
2569 : :
4307 rhaas@postgresql.org 2570 [ + + ]: 2087 : if (!snapshot_now->copied)
2571 : : {
2572 : : /* we don't use the global one anymore */
2573 : 656 : snapshot_now = ReorderBufferCopySnap(rb, snapshot_now,
2574 : : txn, command_id);
2575 : : }
2576 : :
2577 : 2087 : snapshot_now->curcid = command_id;
2578 : :
2579 : 2087 : TeardownHistoricSnapshot(false);
2580 : 2087 : SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash);
2581 : : }
2582 : :
2583 : 11150 : break;
2584 : :
4307 rhaas@postgresql.org 2585 :UBC 0 : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
2586 [ # # ]: 0 : elog(ERROR, "tuplecid value in changequeue");
2587 : : break;
2588 : : }
2589 : :
2590 : : /*
2591 : : * It is possible that the data is not sent to downstream for a
2592 : : * long time either because the output plugin filtered it or there
2593 : : * is a DDL that generates a lot of data that is not processed by
2594 : : * the plugin. So, in such cases, the downstream can timeout. To
2595 : : * avoid that we try to send a keepalive message if required.
2596 : : * Trying to send a keepalive message after every change has some
2597 : : * overhead, but testing showed there is no noticeable overhead if
2598 : : * we do it after every ~100 changes.
2599 : : */
2600 : : #define CHANGES_THRESHOLD 100
2601 : :
1043 akapila@postgresql.o 2602 [ + + ]:CBC 356524 : if (++changes_count >= CHANGES_THRESHOLD)
2603 : : {
137 michael@paquier.xyz 2604 : 3108 : rb->update_progress_txn(rb, txn, prev_lsn);
1043 akapila@postgresql.o 2605 : 3108 : changes_count = 0;
2606 : : }
2607 : : }
2608 : :
2609 : : /* speculative insertion record must be freed by now */
1646 2610 [ - + ]: 2097 : Assert(!specinsert);
2611 : :
2612 : : /* clean up the iterator */
4307 rhaas@postgresql.org 2613 : 2097 : ReorderBufferIterTXNFinish(rb, iterstate);
3980 tgl@sss.pgh.pa.us 2614 : 2097 : iterstate = NULL;
2615 : :
2616 : : /*
2617 : : * Update total transaction count and total bytes processed by the
2618 : : * transaction and its subtransactions. Ensure to not count the
2619 : : * streamed transaction multiple times.
2620 : : *
2621 : : * Note that the statistics computation has to be done after
2622 : : * ReorderBufferIterTXNFinish as it releases the serialized change
2623 : : * which we have already accounted in ReorderBufferIterTXNNext.
2624 : : */
1706 akapila@postgresql.o 2625 [ + + ]: 2097 : if (!rbtxn_is_streamed(txn))
2626 : 1477 : rb->totalTxns++;
2627 : :
1689 2628 : 2097 : rb->totalBytes += txn->total_size;
2629 : :
2630 : : /*
2631 : : * Done with current changes, send the last message for this set of
2632 : : * changes depending upon streaming mode.
2633 : : */
1957 2634 [ + + ]: 2097 : if (streaming)
2635 : : {
2636 [ + + ]: 684 : if (stream_started)
2637 : : {
2638 : 649 : rb->stream_stop(rb, txn, prev_lsn);
2639 : 649 : stream_started = false;
2640 : : }
2641 : : }
2642 : : else
2643 : : {
2644 : : /*
2645 : : * Call either PREPARE (for two-phase transactions) or COMMIT (for
2646 : : * regular ones).
2647 : : */
308 msawada@postgresql.o 2648 [ + + ]: 1413 : if (rbtxn_is_prepared(txn))
2649 : : {
2650 [ - + ]: 29 : Assert(!rbtxn_sent_prepare(txn));
1808 akapila@postgresql.o 2651 : 29 : rb->prepare(rb, txn, commit_lsn);
308 msawada@postgresql.o 2652 : 29 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2653 : : }
2654 : : else
1808 akapila@postgresql.o 2655 : 1384 : rb->commit(rb, txn, commit_lsn);
2656 : : }
2657 : :
2658 : : /* this is just a sanity check against bad output plugin behaviour */
4307 rhaas@postgresql.org 2659 [ - + ]: 2094 : if (GetCurrentTransactionIdIfAny() != InvalidTransactionId)
4249 tgl@sss.pgh.pa.us 2660 [ # # ]:UBC 0 : elog(ERROR, "output plugin used XID %u",
2661 : : GetCurrentTransactionId());
2662 : :
2663 : : /*
2664 : : * Remember the command ID and snapshot for the next set of changes in
2665 : : * streaming mode.
2666 : : */
1957 akapila@postgresql.o 2667 [ + + ]:CBC 2094 : if (streaming)
2668 : 684 : ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id);
2669 [ + + ]: 1410 : else if (snapshot_now->copied)
2670 : 656 : ReorderBufferFreeSnap(rb, snapshot_now);
2671 : :
2672 : : /* cleanup */
4307 rhaas@postgresql.org 2673 : 2094 : TeardownHistoricSnapshot(false);
2674 : :
2675 : : /*
2676 : : * Aborting the current (sub-)transaction as a whole has the right
2677 : : * semantics. We want all locks acquired in here to be released, not
2678 : : * reassigned to the parent and we do not want any database access
2679 : : * have persistent effects.
2680 : : */
4052 andres@anarazel.de 2681 : 2094 : AbortCurrentTransaction();
2682 : :
2683 : : /* make sure there's no cache pollution */
184 msawada@postgresql.o 2684 [ - + ]: 2094 : if (rbtxn_distr_inval_overflowed(txn))
2685 : : {
184 msawada@postgresql.o 2686 [ # # ]:UBC 0 : Assert(txn->ninvalidations_distributed == 0);
2687 : 0 : InvalidateSystemCaches();
2688 : : }
2689 : : else
2690 : : {
184 msawada@postgresql.o 2691 :CBC 2094 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2692 : 2094 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2693 : : txn->invalidations_distributed);
2694 : : }
2695 : :
4052 andres@anarazel.de 2696 [ + + ]: 2094 : if (using_subtxn)
2697 : : {
4307 rhaas@postgresql.org 2698 : 481 : RollbackAndReleaseCurrentSubTransaction();
96 alvherre@kurilemu.de 2699 :GNC 481 : MemoryContextSwitchTo(ccxt);
2700 : 481 : CurrentResourceOwner = cowner;
2701 : : }
2702 : :
2703 : : /*
2704 : : * We are here due to one of the four reasons: 1. Decoding an
2705 : : * in-progress txn. 2. Decoding a prepared txn. 3. Decoding of a
2706 : : * prepared txn that was (partially) streamed. 4. Decoding a committed
2707 : : * txn.
2708 : : *
2709 : : * For 1, we allow truncation of txn data by removing the changes
2710 : : * already streamed but still keeping other things like invalidations,
2711 : : * snapshot, and tuplecids. For 2 and 3, we indicate
2712 : : * ReorderBufferTruncateTXN to do more elaborate truncation of txn
2713 : : * data as the entire transaction has been decoded except for commit.
2714 : : * For 4, as the entire txn has been decoded, we can fully clean up
2715 : : * the TXN reorder buffer.
2716 : : */
308 msawada@postgresql.o 2717 [ + + + + ]:CBC 2094 : if (streaming || rbtxn_is_prepared(txn))
2718 : : {
2719 [ + + ]: 713 : if (streaming)
2720 : 684 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2721 : :
2722 : 713 : ReorderBufferTruncateTXN(rb, txn, rbtxn_is_prepared(txn));
2723 : : /* Reset the CheckXidAlive */
1957 akapila@postgresql.o 2724 : 713 : CheckXidAlive = InvalidTransactionId;
2725 : : }
2726 : : else
2727 : 1381 : ReorderBufferCleanupTXN(rb, txn);
2728 : : }
4307 rhaas@postgresql.org 2729 : 9 : PG_CATCH();
2730 : : {
1957 akapila@postgresql.o 2731 : 9 : MemoryContext ecxt = MemoryContextSwitchTo(ccxt);
2732 : 9 : ErrorData *errdata = CopyErrorData();
2733 : :
2734 : : /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */
4307 rhaas@postgresql.org 2735 [ + - ]: 9 : if (iterstate)
2736 : 9 : ReorderBufferIterTXNFinish(rb, iterstate);
2737 : :
2738 : 9 : TeardownHistoricSnapshot(true);
2739 : :
2740 : : /*
2741 : : * Force cache invalidation to happen outside of a valid transaction
2742 : : * to prevent catalog access as we just caught an error.
2743 : : */
4052 andres@anarazel.de 2744 : 9 : AbortCurrentTransaction();
2745 : :
2746 : : /* make sure there's no cache pollution */
184 msawada@postgresql.o 2747 [ - + ]: 9 : if (rbtxn_distr_inval_overflowed(txn))
2748 : : {
184 msawada@postgresql.o 2749 [ # # ]:UBC 0 : Assert(txn->ninvalidations_distributed == 0);
2750 : 0 : InvalidateSystemCaches();
2751 : : }
2752 : : else
2753 : : {
184 msawada@postgresql.o 2754 :CBC 9 : ReorderBufferExecuteInvalidations(txn->ninvalidations, txn->invalidations);
2755 : 9 : ReorderBufferExecuteInvalidations(txn->ninvalidations_distributed,
2756 : : txn->invalidations_distributed);
2757 : : }
2758 : :
4052 andres@anarazel.de 2759 [ + + ]: 9 : if (using_subtxn)
2760 : : {
2761 : 4 : RollbackAndReleaseCurrentSubTransaction();
96 alvherre@kurilemu.de 2762 :GNC 4 : MemoryContextSwitchTo(ccxt);
2763 : 4 : CurrentResourceOwner = cowner;
2764 : : }
2765 : :
2766 : : /*
2767 : : * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent
2768 : : * abort of the (sub)transaction we are streaming or preparing. We
2769 : : * need to do the cleanup and return gracefully on this error, see
2770 : : * SetupCheckXidLive.
2771 : : *
2772 : : * This error code can be thrown by one of the callbacks we call
2773 : : * during decoding so we need to ensure that we return gracefully only
2774 : : * when we are sending the data in streaming mode and the streaming is
2775 : : * not finished yet or when we are sending the data out on a PREPARE
2776 : : * during a two-phase commit.
2777 : : */
1686 akapila@postgresql.o 2778 [ + + ]:CBC 9 : if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK &&
308 msawada@postgresql.o 2779 [ - + - - ]: 8 : (stream_started || rbtxn_is_prepared(txn)))
2780 : : {
2781 : : /* curtxn must be set for streaming or prepared transactions */
1686 akapila@postgresql.o 2782 [ - + ]: 8 : Assert(curtxn);
2783 : :
2784 : : /* Cleanup the temporary error state. */
1957 2785 : 8 : FlushErrorState();
2786 : 8 : FreeErrorData(errdata);
2787 : 8 : errdata = NULL;
2788 : :
2789 : : /* Remember the transaction is aborted. */
308 msawada@postgresql.o 2790 [ - + ]: 8 : Assert(!rbtxn_is_committed(curtxn));
2791 : 8 : curtxn->txn_flags |= RBTXN_IS_ABORTED;
2792 : :
2793 : : /* Mark the transaction is streamed if appropriate */
2794 [ + - ]: 8 : if (stream_started)
2795 : 8 : ReorderBufferMaybeMarkTXNStreamed(rb, txn);
2796 : :
2797 : : /* Reset the TXN so that it is allowed to stream remaining data. */
1957 akapila@postgresql.o 2798 : 8 : ReorderBufferResetTXN(rb, txn, snapshot_now,
2799 : : command_id, prev_lsn,
2800 : : specinsert);
2801 : : }
2802 : : else
2803 : : {
2804 : 1 : ReorderBufferCleanupTXN(rb, txn);
2805 : 1 : MemoryContextSwitchTo(ecxt);
2806 : 1 : PG_RE_THROW();
2807 : : }
2808 : : }
2809 [ - + ]: 2102 : PG_END_TRY();
2810 : 2102 : }
2811 : :
2812 : : /*
2813 : : * Perform the replay of a transaction and its non-aborted subtransactions.
2814 : : *
2815 : : * Subtransactions previously have to be processed by
2816 : : * ReorderBufferCommitChild(), even if previously assigned to the toplevel
2817 : : * transaction with ReorderBufferAssignChild.
2818 : : *
2819 : : * This interface is called once a prepare or toplevel commit is read for both
2820 : : * streamed as well as non-streamed transactions.
2821 : : */
2822 : : static void
1808 2823 : 1480 : ReorderBufferReplay(ReorderBufferTXN *txn,
2824 : : ReorderBuffer *rb, TransactionId xid,
2825 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2826 : : TimestampTz commit_time,
2827 : : RepOriginId origin_id, XLogRecPtr origin_lsn)
2828 : : {
2829 : : Snapshot snapshot_now;
1957 2830 : 1480 : CommandId command_id = FirstCommandId;
2831 : :
2832 : 1480 : txn->final_lsn = commit_lsn;
2833 : 1480 : txn->end_lsn = end_lsn;
78 peter@eisentraut.org 2834 :GNC 1480 : txn->commit_time = commit_time;
1957 akapila@postgresql.o 2835 :CBC 1480 : txn->origin_id = origin_id;
2836 : 1480 : txn->origin_lsn = origin_lsn;
2837 : :
2838 : : /*
2839 : : * If the transaction was (partially) streamed, we need to commit it in a
2840 : : * 'streamed' way. That is, we first stream the remaining part of the
2841 : : * transaction, and then invoke stream_commit message.
2842 : : *
2843 : : * Called after everything (origin ID, LSN, ...) is stored in the
2844 : : * transaction to avoid passing that information directly.
2845 : : */
2846 [ + + ]: 1480 : if (rbtxn_is_streamed(txn))
2847 : : {
2848 : 62 : ReorderBufferStreamCommit(rb, txn);
2849 : 62 : return;
2850 : : }
2851 : :
2852 : : /*
2853 : : * If this transaction has no snapshot, it didn't make any changes to the
2854 : : * database, so there's nothing to decode. Note that
2855 : : * ReorderBufferCommitChild will have transferred any snapshots from
2856 : : * subtransactions if there were any.
2857 : : */
2858 [ + + ]: 1418 : if (txn->base_snapshot == NULL)
2859 : : {
2860 [ - + ]: 3 : Assert(txn->ninvalidations == 0);
2861 : :
2862 : : /*
2863 : : * Removing this txn before a commit might result in the computation
2864 : : * of an incorrect restart_lsn. See SnapBuildProcessRunningXacts.
2865 : : */
308 msawada@postgresql.o 2866 [ + - ]: 3 : if (!rbtxn_is_prepared(txn))
1808 akapila@postgresql.o 2867 : 3 : ReorderBufferCleanupTXN(rb, txn);
1957 2868 : 3 : return;
2869 : : }
2870 : :
2871 : 1415 : snapshot_now = txn->base_snapshot;
2872 : :
2873 : : /* Process and send the changes to output plugin. */
2874 : 1415 : ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now,
2875 : : command_id, false);
2876 : : }
2877 : :
2878 : : /*
2879 : : * Commit a transaction.
2880 : : *
2881 : : * See comments for ReorderBufferReplay().
2882 : : */
2883 : : void
1808 2884 : 1452 : ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid,
2885 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
2886 : : TimestampTz commit_time,
2887 : : RepOriginId origin_id, XLogRecPtr origin_lsn)
2888 : : {
2889 : : ReorderBufferTXN *txn;
2890 : :
2891 : 1452 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2892 : : false);
2893 : :
2894 : : /* unknown transaction, nothing to replay */
2895 [ + + ]: 1452 : if (txn == NULL)
2896 : 13 : return;
2897 : :
2898 : 1439 : ReorderBufferReplay(txn, rb, xid, commit_lsn, end_lsn, commit_time,
2899 : : origin_id, origin_lsn);
2900 : : }
2901 : :
2902 : : /*
2903 : : * Record the prepare information for a transaction. Also, mark the transaction
2904 : : * as a prepared transaction.
2905 : : */
2906 : : bool
2907 : 139 : ReorderBufferRememberPrepareInfo(ReorderBuffer *rb, TransactionId xid,
2908 : : XLogRecPtr prepare_lsn, XLogRecPtr end_lsn,
2909 : : TimestampTz prepare_time,
2910 : : RepOriginId origin_id, XLogRecPtr origin_lsn)
2911 : : {
2912 : : ReorderBufferTXN *txn;
2913 : :
2914 : 139 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2915 : :
2916 : : /* unknown transaction, nothing to do */
2917 [ - + ]: 139 : if (txn == NULL)
1808 akapila@postgresql.o 2918 :UBC 0 : return false;
2919 : :
2920 : : /*
2921 : : * Remember the prepare information to be later used by commit prepared in
2922 : : * case we skip doing prepare.
2923 : : */
1808 akapila@postgresql.o 2924 :CBC 139 : txn->final_lsn = prepare_lsn;
2925 : 139 : txn->end_lsn = end_lsn;
78 peter@eisentraut.org 2926 :GNC 139 : txn->prepare_time = prepare_time;
1808 akapila@postgresql.o 2927 :CBC 139 : txn->origin_id = origin_id;
2928 : 139 : txn->origin_lsn = origin_lsn;
2929 : :
2930 : : /* Mark this transaction as a prepared transaction */
308 msawada@postgresql.o 2931 [ - + ]: 139 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == 0);
2932 : 139 : txn->txn_flags |= RBTXN_IS_PREPARED;
2933 : :
1808 akapila@postgresql.o 2934 : 139 : return true;
2935 : : }
2936 : :
2937 : : /* Remember that we have skipped prepare */
2938 : : void
2939 : 101 : ReorderBufferSkipPrepare(ReorderBuffer *rb, TransactionId xid)
2940 : : {
2941 : : ReorderBufferTXN *txn;
2942 : :
2943 : 101 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, false);
2944 : :
2945 : : /* unknown transaction, nothing to do */
2946 [ - + ]: 101 : if (txn == NULL)
1808 akapila@postgresql.o 2947 :UBC 0 : return;
2948 : :
2949 : : /* txn must have been marked as a prepared transaction */
308 msawada@postgresql.o 2950 [ - + ]:CBC 101 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
1808 akapila@postgresql.o 2951 : 101 : txn->txn_flags |= RBTXN_SKIPPED_PREPARE;
2952 : : }
2953 : :
2954 : : /*
2955 : : * Prepare a two-phase transaction.
2956 : : *
2957 : : * See comments for ReorderBufferReplay().
2958 : : */
2959 : : void
2960 : 38 : ReorderBufferPrepare(ReorderBuffer *rb, TransactionId xid,
2961 : : char *gid)
2962 : : {
2963 : : ReorderBufferTXN *txn;
2964 : :
2965 : 38 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
2966 : : false);
2967 : :
2968 : : /* unknown transaction, nothing to replay */
2969 [ - + ]: 38 : if (txn == NULL)
1808 akapila@postgresql.o 2970 :UBC 0 : return;
2971 : :
2972 : : /*
2973 : : * txn must have been marked as a prepared transaction and must have
2974 : : * neither been skipped nor sent a prepare. Also, the prepare info must
2975 : : * have been updated in it by now.
2976 : : */
308 msawada@postgresql.o 2977 [ - + ]:CBC 38 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) == RBTXN_IS_PREPARED);
41 alvherre@kurilemu.de 2978 [ - + ]:GNC 38 : Assert(XLogRecPtrIsValid(txn->final_lsn));
2979 : :
308 msawada@postgresql.o 2980 :CBC 38 : txn->gid = pstrdup(gid);
2981 : :
1808 akapila@postgresql.o 2982 : 38 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
78 peter@eisentraut.org 2983 :GNC 38 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
2984 : :
2985 : : /*
2986 : : * Send a prepare if not already done so. This might occur if we have
2987 : : * detected a concurrent abort while replaying the non-streaming
2988 : : * transaction.
2989 : : */
308 msawada@postgresql.o 2990 [ - + ]:CBC 38 : if (!rbtxn_sent_prepare(txn))
2991 : : {
1721 akapila@postgresql.o 2992 :UBC 0 : rb->prepare(rb, txn, txn->final_lsn);
308 msawada@postgresql.o 2993 : 0 : txn->txn_flags |= RBTXN_SENT_PREPARE;
2994 : : }
2995 : : }
2996 : :
2997 : : /*
2998 : : * This is used to handle COMMIT/ROLLBACK PREPARED.
2999 : : */
3000 : : void
1808 akapila@postgresql.o 3001 :CBC 41 : ReorderBufferFinishPrepared(ReorderBuffer *rb, TransactionId xid,
3002 : : XLogRecPtr commit_lsn, XLogRecPtr end_lsn,
3003 : : XLogRecPtr two_phase_at,
3004 : : TimestampTz commit_time, RepOriginId origin_id,
3005 : : XLogRecPtr origin_lsn, char *gid, bool is_commit)
3006 : : {
3007 : : ReorderBufferTXN *txn;
3008 : : XLogRecPtr prepare_end_lsn;
3009 : : TimestampTz prepare_time;
3010 : :
1758 3011 : 41 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, commit_lsn, false);
3012 : :
3013 : : /* unknown transaction, nothing to do */
1808 3014 [ - + ]: 41 : if (txn == NULL)
1808 akapila@postgresql.o 3015 :UBC 0 : return;
3016 : :
3017 : : /*
3018 : : * By this time the txn has the prepare record information, remember it to
3019 : : * be later used for rollback.
3020 : : */
1808 akapila@postgresql.o 3021 :CBC 41 : prepare_end_lsn = txn->end_lsn;
78 peter@eisentraut.org 3022 :GNC 41 : prepare_time = txn->prepare_time;
3023 : :
3024 : : /* add the gid in the txn */
1808 akapila@postgresql.o 3025 :CBC 41 : txn->gid = pstrdup(gid);
3026 : :
3027 : : /*
3028 : : * It is possible that this transaction is not decoded at prepare time
3029 : : * either because by that time we didn't have a consistent snapshot, or
3030 : : * two_phase was not enabled, or it was decoded earlier but we have
3031 : : * restarted. We only need to send the prepare if it was not decoded
3032 : : * earlier. We don't need to decode the xact for aborts if it is not done
3033 : : * already.
3034 : : */
1617 3035 [ + + + - ]: 41 : if ((txn->final_lsn < two_phase_at) && is_commit)
3036 : : {
3037 : : /*
3038 : : * txn must have been marked as a prepared transaction and skipped but
3039 : : * not sent a prepare. Also, the prepare info must have been updated
3040 : : * in txn even if we skip prepare.
3041 : : */
308 msawada@postgresql.o 3042 [ - + ]: 3 : Assert((txn->txn_flags & RBTXN_PREPARE_STATUS_MASK) ==
3043 : : (RBTXN_IS_PREPARED | RBTXN_SKIPPED_PREPARE));
41 alvherre@kurilemu.de 3044 [ - + ]:GNC 3 : Assert(XLogRecPtrIsValid(txn->final_lsn));
3045 : :
3046 : : /*
3047 : : * By this time the txn has the prepare record information and it is
3048 : : * important to use that so that downstream gets the accurate
3049 : : * information. If instead, we have passed commit information here
3050 : : * then downstream can behave as it has already replayed commit
3051 : : * prepared after the restart.
3052 : : */
1808 akapila@postgresql.o 3053 :CBC 3 : ReorderBufferReplay(txn, rb, xid, txn->final_lsn, txn->end_lsn,
78 peter@eisentraut.org 3054 :GNC 3 : txn->prepare_time, txn->origin_id, txn->origin_lsn);
3055 : : }
3056 : :
1808 akapila@postgresql.o 3057 :CBC 41 : txn->final_lsn = commit_lsn;
3058 : 41 : txn->end_lsn = end_lsn;
78 peter@eisentraut.org 3059 :GNC 41 : txn->commit_time = commit_time;
1808 akapila@postgresql.o 3060 :CBC 41 : txn->origin_id = origin_id;
3061 : 41 : txn->origin_lsn = origin_lsn;
3062 : :
3063 [ + + ]: 41 : if (is_commit)
3064 : 32 : rb->commit_prepared(rb, txn, commit_lsn);
3065 : : else
3066 : 9 : rb->rollback_prepared(rb, txn, prepare_end_lsn, prepare_time);
3067 : :
3068 : : /* cleanup: make sure there's no cache pollution */
3069 : 41 : ReorderBufferExecuteInvalidations(txn->ninvalidations,
3070 : : txn->invalidations);
3071 : 41 : ReorderBufferCleanupTXN(rb, txn);
3072 : : }
3073 : :
3074 : : /*
3075 : : * Abort a transaction that possibly has previous changes. Needs to be first
3076 : : * called for subtransactions and then for the toplevel xid.
3077 : : *
3078 : : * NB: Transactions handled here have to have actively aborted (i.e. have
3079 : : * produced an abort record). Implicitly aborted transactions are handled via
3080 : : * ReorderBufferAbortOld(); transactions we're just not interested in, but
3081 : : * which have committed are handled in ReorderBufferForget().
3082 : : *
3083 : : * This function purges this transaction and its contents from memory and
3084 : : * disk.
3085 : : */
3086 : : void
1073 3087 : 165 : ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn,
3088 : : TimestampTz abort_time)
3089 : : {
3090 : : ReorderBufferTXN *txn;
3091 : :
4307 rhaas@postgresql.org 3092 : 165 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3093 : : false);
3094 : :
3095 : : /* unknown, nothing to remove */
3096 [ - + ]: 165 : if (txn == NULL)
4307 rhaas@postgresql.org 3097 :UBC 0 : return;
3098 : :
78 peter@eisentraut.org 3099 :GNC 165 : txn->abort_time = abort_time;
3100 : :
3101 : : /* For streamed transactions notify the remote node about the abort. */
1957 akapila@postgresql.o 3102 [ + + ]:CBC 165 : if (rbtxn_is_streamed(txn))
3103 : : {
3104 : 30 : rb->stream_abort(rb, txn, lsn);
3105 : :
3106 : : /*
3107 : : * We might have decoded changes for this transaction that could load
3108 : : * the cache as per the current transaction's view (consider DDLs that
3109 : : * happened in this transaction). We don't want the decoding of future
3110 : : * transactions to use those cache entries so execute only the inval
3111 : : * messages in this transaction.
3112 : : */
3113 [ - + ]: 30 : if (txn->ninvalidations > 0)
1957 akapila@postgresql.o 3114 :UBC 0 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3115 : : txn->invalidations);
3116 : : }
3117 : :
3118 : : /* cosmetic... */
4307 rhaas@postgresql.org 3119 :CBC 165 : txn->final_lsn = lsn;
3120 : :
3121 : : /* remove potential on-disk data, and deallocate */
3122 : 165 : ReorderBufferCleanupTXN(rb, txn);
3123 : : }
3124 : :
3125 : : /*
3126 : : * Abort all transactions that aren't actually running anymore because the
3127 : : * server restarted.
3128 : : *
3129 : : * NB: These really have to be transactions that have aborted due to a server
3130 : : * crash/immediate restart, as we don't deal with invalidations here.
3131 : : */
3132 : : void
3133 : 1444 : ReorderBufferAbortOld(ReorderBuffer *rb, TransactionId oldestRunningXid)
3134 : : {
3135 : : dlist_mutable_iter it;
3136 : :
3137 : : /*
3138 : : * Iterate through all (potential) toplevel TXNs and abort all that are
3139 : : * older than what possibly can be running. Once we've found the first
3140 : : * that is alive we stop, there might be some that acquired an xid earlier
3141 : : * but started writing later, but it's unlikely and they will be cleaned
3142 : : * up in a later call to this function.
3143 : : */
3144 [ + - + + ]: 1447 : dlist_foreach_modify(it, &rb->toplevel_by_lsn)
3145 : : {
3146 : : ReorderBufferTXN *txn;
3147 : :
3148 : 60 : txn = dlist_container(ReorderBufferTXN, node, it.cur);
3149 : :
3150 [ + + ]: 60 : if (TransactionIdPrecedes(txn->xid, oldestRunningXid))
3151 : : {
3140 andres@anarazel.de 3152 [ - + ]: 3 : elog(DEBUG2, "aborting old transaction %u", txn->xid);
3153 : :
3154 : : /* Notify the remote node about the crash/immediate restart. */
1075 akapila@postgresql.o 3155 [ - + ]: 3 : if (rbtxn_is_streamed(txn))
1075 akapila@postgresql.o 3156 :UBC 0 : rb->stream_abort(rb, txn, InvalidXLogRecPtr);
3157 : :
3158 : : /* remove potential on-disk data, and deallocate this tx */
4307 rhaas@postgresql.org 3159 :CBC 3 : ReorderBufferCleanupTXN(rb, txn);
3160 : : }
3161 : : else
3162 : 57 : return;
3163 : : }
3164 : : }
3165 : :
3166 : : /*
3167 : : * Forget the contents of a transaction if we aren't interested in its
3168 : : * contents. Needs to be first called for subtransactions and then for the
3169 : : * toplevel xid.
3170 : : *
3171 : : * This is significantly different to ReorderBufferAbort() because
3172 : : * transactions that have committed need to be treated differently from aborted
3173 : : * ones since they may have modified the catalog.
3174 : : *
3175 : : * Note that this is only allowed to be called at the moment a transaction
3176 : : * commit has just been read, not earlier; otherwise later records referring
3177 : : * to this xid might re-create the transaction incompletely.
3178 : : */
3179 : : void
3180 : 2594 : ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3181 : : {
3182 : : ReorderBufferTXN *txn;
3183 : :
3184 : 2594 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3185 : : false);
3186 : :
3187 : : /* unknown, nothing to forget */
3188 [ + + ]: 2594 : if (txn == NULL)
3189 : 561 : return;
3190 : :
3191 : : /* this transaction mustn't be streamed */
1105 akapila@postgresql.o 3192 [ - + ]: 2033 : Assert(!rbtxn_is_streamed(txn));
3193 : :
3194 : : /* cosmetic... */
4307 rhaas@postgresql.org 3195 : 2033 : txn->final_lsn = lsn;
3196 : :
3197 : : /*
3198 : : * Process only cache invalidation messages in this transaction if there
3199 : : * are any. Even if we're not interested in the transaction's contents, it
3200 : : * could have manipulated the catalog and we need to update the caches
3201 : : * according to that.
3202 : : */
3203 [ + + + + ]: 2033 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3525 andres@anarazel.de 3204 : 560 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3205 : : txn->invalidations);
3206 : : else
4307 rhaas@postgresql.org 3207 [ - + ]: 1473 : Assert(txn->ninvalidations == 0);
3208 : :
3209 : : /* remove potential on-disk data, and deallocate */
3210 : 2033 : ReorderBufferCleanupTXN(rb, txn);
3211 : : }
3212 : :
3213 : : /*
3214 : : * Invalidate cache for those transactions that need to be skipped just in case
3215 : : * catalogs were manipulated as part of the transaction.
3216 : : *
3217 : : * Note that this is a special-purpose function for prepared transactions where
3218 : : * we don't want to clean up the TXN even when we decide to skip it. See
3219 : : * DecodePrepare.
3220 : : */
3221 : : void
1808 akapila@postgresql.o 3222 : 98 : ReorderBufferInvalidate(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3223 : : {
3224 : : ReorderBufferTXN *txn;
3225 : :
3226 : 98 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3227 : : false);
3228 : :
3229 : : /* unknown, nothing to do */
3230 [ - + ]: 98 : if (txn == NULL)
1808 akapila@postgresql.o 3231 :UBC 0 : return;
3232 : :
3233 : : /*
3234 : : * Process cache invalidation messages if there are any. Even if we're not
3235 : : * interested in the transaction's contents, it could have manipulated the
3236 : : * catalog and we need to update the caches according to that.
3237 : : */
1808 akapila@postgresql.o 3238 [ + - + + ]:CBC 98 : if (txn->base_snapshot != NULL && txn->ninvalidations > 0)
3239 : 29 : ReorderBufferImmediateInvalidation(rb, txn->ninvalidations,
3240 : : txn->invalidations);
3241 : : else
3242 [ - + ]: 69 : Assert(txn->ninvalidations == 0);
3243 : : }
3244 : :
3245 : :
3246 : : /*
3247 : : * Execute invalidations happening outside the context of a decoded
3248 : : * transaction. That currently happens either for xid-less commits
3249 : : * (cf. RecordTransactionCommit()) or for invalidations in uninteresting
3250 : : * transactions (via ReorderBufferForget()).
3251 : : */
3252 : : void
3525 andres@anarazel.de 3253 : 596 : ReorderBufferImmediateInvalidation(ReorderBuffer *rb, uint32 ninvalidations,
3254 : : SharedInvalidationMessage *invalidations)
3255 : : {
3256 : 596 : bool use_subtxn = IsTransactionOrTransactionBlock();
96 alvherre@kurilemu.de 3257 :GNC 596 : MemoryContext ccxt = CurrentMemoryContext;
3258 : 596 : ResourceOwner cowner = CurrentResourceOwner;
3259 : : int i;
3260 : :
3525 andres@anarazel.de 3261 [ + + ]:CBC 596 : if (use_subtxn)
3262 : 427 : BeginInternalSubTransaction("replay");
3263 : :
3264 : : /*
3265 : : * Force invalidations to happen outside of a valid transaction - that way
3266 : : * entries will just be marked as invalid without accessing the catalog.
3267 : : * That's advantageous because we don't need to setup the full state
3268 : : * necessary for catalog access.
3269 : : */
3270 [ + + ]: 596 : if (use_subtxn)
3271 : 427 : AbortCurrentTransaction();
3272 : :
3273 [ + + ]: 24977 : for (i = 0; i < ninvalidations; i++)
3274 : 24381 : LocalExecuteInvalidationMessage(&invalidations[i]);
3275 : :
3276 [ + + ]: 596 : if (use_subtxn)
3277 : : {
3278 : 427 : RollbackAndReleaseCurrentSubTransaction();
96 alvherre@kurilemu.de 3279 :GNC 427 : MemoryContextSwitchTo(ccxt);
3280 : 427 : CurrentResourceOwner = cowner;
3281 : : }
3525 andres@anarazel.de 3282 :CBC 596 : }
3283 : :
3284 : : /*
3285 : : * Tell reorderbuffer about an xid seen in the WAL stream. Has to be called at
3286 : : * least once for every xid in XLogRecord->xl_xid (other places in records
3287 : : * may, but do not have to be passed through here).
3288 : : *
3289 : : * Reorderbuffer keeps some data structures about transactions in LSN order,
3290 : : * for efficiency. To do that it has to know about when transactions are seen
3291 : : * first in the WAL. As many types of records are not actually interesting for
3292 : : * logical decoding, they do not necessarily pass through here.
3293 : : */
3294 : : void
3574 3295 : 2182343 : ReorderBufferProcessXid(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn)
3296 : : {
3297 : : /* many records won't have an xid assigned, centralize check here */
3298 [ + + ]: 2182343 : if (xid != InvalidTransactionId)
3299 : 2180281 : ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
4307 rhaas@postgresql.org 3300 : 2182343 : }
3301 : :
3302 : : /*
3303 : : * Add a new snapshot to this transaction that may only be used after lsn
3304 : : * 'lsn' because the previous snapshot doesn't describe the catalog correctly
3305 : : * for following rows.
3306 : : */
3307 : : void
3308 : 1261 : ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid,
3309 : : XLogRecPtr lsn, Snapshot snap)
3310 : : {
280 heikki.linnakangas@i 3311 : 1261 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3312 : :
4303 tgl@sss.pgh.pa.us 3313 : 1261 : change->data.snapshot = snap;
3314 : 1261 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT;
3315 : :
1957 akapila@postgresql.o 3316 : 1261 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
4307 rhaas@postgresql.org 3317 : 1261 : }
3318 : :
3319 : : /*
3320 : : * Set up the transaction's base snapshot.
3321 : : *
3322 : : * If we know that xid is a subtransaction, set the base snapshot on the
3323 : : * top-level transaction instead.
3324 : : */
3325 : : void
3326 : 3250 : ReorderBufferSetBaseSnapshot(ReorderBuffer *rb, TransactionId xid,
3327 : : XLogRecPtr lsn, Snapshot snap)
3328 : : {
3329 : : ReorderBufferTXN *txn;
3330 : : bool is_new;
3331 : :
1146 peter@eisentraut.org 3332 [ - + ]: 3250 : Assert(snap != NULL);
3333 : :
3334 : : /*
3335 : : * Fetch the transaction to operate on. If we know it's a subtransaction,
3336 : : * operate on its top-level transaction instead.
3337 : : */
4307 rhaas@postgresql.org 3338 : 3250 : txn = ReorderBufferTXNByXid(rb, xid, true, &is_new, lsn, true);
2168 alvherre@alvh.no-ip. 3339 [ + + ]: 3250 : if (rbtxn_is_known_subxact(txn))
2731 3340 : 98 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3341 : : NULL, InvalidXLogRecPtr, false);
4307 rhaas@postgresql.org 3342 [ - + ]: 3250 : Assert(txn->base_snapshot == NULL);
3343 : :
3344 : 3250 : txn->base_snapshot = snap;
3345 : 3250 : txn->base_snapshot_lsn = lsn;
2731 alvherre@alvh.no-ip. 3346 : 3250 : dlist_push_tail(&rb->txns_by_base_snapshot_lsn, &txn->base_snapshot_node);
3347 : :
3348 : 3250 : AssertTXNLsnOrder(rb);
4307 rhaas@postgresql.org 3349 : 3250 : }
3350 : :
3351 : : /*
3352 : : * Access the catalog with this CommandId at this point in the changestream.
3353 : : *
3354 : : * May only be called for command ids > 1
3355 : : */
3356 : : void
3357 : 24187 : ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid,
3358 : : XLogRecPtr lsn, CommandId cid)
3359 : : {
280 heikki.linnakangas@i 3360 : 24187 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3361 : :
4303 tgl@sss.pgh.pa.us 3362 : 24187 : change->data.command_id = cid;
3363 : 24187 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID;
3364 : :
1957 akapila@postgresql.o 3365 : 24187 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
4307 rhaas@postgresql.org 3366 : 24187 : }
3367 : :
3368 : : /*
3369 : : * Update memory counters to account for the new or removed change.
3370 : : *
3371 : : * We update two counters - in the reorder buffer, and in the transaction
3372 : : * containing the change. The reorder buffer counter allows us to quickly
3373 : : * decide if we reached the memory limit, the transaction counter allows
3374 : : * us to quickly pick the largest transaction for eviction.
3375 : : *
3376 : : * At least one of txn and change must be non-NULL. We update the memory
3377 : : * counter of txn if it's non-NULL, otherwise that of change->txn.
3378 : : *
3379 : : * When streaming is enabled, we need to update the toplevel transaction
3380 : : * counters instead - we don't really care about subtransactions as we
3381 : : * can't stream them individually anyway, and we only pick toplevel
3382 : : * transactions for eviction. So only toplevel transactions matter.
3383 : : */
3384 : : static void
2223 akapila@postgresql.o 3385 : 1802627 : ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb,
3386 : : ReorderBufferChange *change,
3387 : : ReorderBufferTXN *txn,
3388 : : bool addition, Size sz)
3389 : : {
3390 : : ReorderBufferTXN *toptxn;
3391 : :
623 msawada@postgresql.o 3392 [ + + - + ]: 1802627 : Assert(txn || change);
3393 : :
3394 : : /*
3395 : : * Ignore tuple CID changes, because those are not evicted when reaching
3396 : : * memory limit. So we just don't count them, because it might easily
3397 : : * trigger a pointless attempt to spill.
3398 : : */
3399 [ + + + + ]: 1802627 : if (change && change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID)
2223 akapila@postgresql.o 3400 : 24070 : return;
3401 : :
623 msawada@postgresql.o 3402 [ + + ]: 1778557 : if (sz == 0)
3403 : 875 : return;
3404 : :
3405 [ + + ]: 1777682 : if (txn == NULL)
3406 : 1770032 : txn = change->txn;
3407 [ - + ]: 1777682 : Assert(txn != NULL);
3408 : :
3409 : : /*
3410 : : * Update the total size in top level as well. This is later used to
3411 : : * compute the decoding stats.
3412 : : */
1006 akapila@postgresql.o 3413 [ + + ]: 1777682 : toptxn = rbtxn_get_toptxn(txn);
3414 : :
2223 3415 [ + + ]: 1777682 : if (addition)
3416 : : {
615 msawada@postgresql.o 3417 : 1591379 : Size oldsize = txn->size;
3418 : :
1957 akapila@postgresql.o 3419 : 1591379 : txn->size += sz;
2223 3420 : 1591379 : rb->size += sz;
3421 : :
3422 : : /* Update the total size in the top transaction. */
1689 3423 : 1591379 : toptxn->total_size += sz;
3424 : :
3425 : : /* Update the max-heap */
615 msawada@postgresql.o 3426 [ + + ]: 1591379 : if (oldsize != 0)
3427 : 1583664 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3428 : 1591379 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3429 : : }
3430 : : else
3431 : : {
1957 akapila@postgresql.o 3432 [ + - - + ]: 186303 : Assert((rb->size >= sz) && (txn->size >= sz));
3433 : 186303 : txn->size -= sz;
2223 3434 : 186303 : rb->size -= sz;
3435 : :
3436 : : /* Update the total size in the top transaction. */
1689 3437 : 186303 : toptxn->total_size -= sz;
3438 : :
3439 : : /* Update the max-heap */
615 msawada@postgresql.o 3440 : 186303 : pairingheap_remove(rb->txn_heap, &txn->txn_node);
3441 [ + + ]: 186303 : if (txn->size != 0)
3442 : 178624 : pairingheap_add(rb->txn_heap, &txn->txn_node);
3443 : : }
3444 : :
1957 akapila@postgresql.o 3445 [ - + ]: 1777682 : Assert(txn->size <= rb->size);
3446 : : }
3447 : :
3448 : : /*
3449 : : * Add new (relfilelocator, tid) -> (cmin, cmax) mappings.
3450 : : *
3451 : : * We do not include this change type in memory accounting, because we
3452 : : * keep CIDs in a separate list and do not evict them when reaching
3453 : : * the memory limit.
3454 : : */
3455 : : void
4307 rhaas@postgresql.org 3456 : 24187 : ReorderBufferAddNewTupleCids(ReorderBuffer *rb, TransactionId xid,
3457 : : XLogRecPtr lsn, RelFileLocator locator,
3458 : : ItemPointerData tid, CommandId cmin,
3459 : : CommandId cmax, CommandId combocid)
3460 : : {
280 heikki.linnakangas@i 3461 : 24187 : ReorderBufferChange *change = ReorderBufferAllocChange(rb);
3462 : : ReorderBufferTXN *txn;
3463 : :
4307 rhaas@postgresql.org 3464 : 24187 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3465 : :
1260 3466 : 24187 : change->data.tuplecid.locator = locator;
4303 tgl@sss.pgh.pa.us 3467 : 24187 : change->data.tuplecid.tid = tid;
3468 : 24187 : change->data.tuplecid.cmin = cmin;
3469 : 24187 : change->data.tuplecid.cmax = cmax;
3470 : 24187 : change->data.tuplecid.combocid = combocid;
4307 rhaas@postgresql.org 3471 : 24187 : change->lsn = lsn;
2223 akapila@postgresql.o 3472 : 24187 : change->txn = txn;
4303 tgl@sss.pgh.pa.us 3473 : 24187 : change->action = REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID;
3474 : :
4307 rhaas@postgresql.org 3475 : 24187 : dlist_push_tail(&txn->tuplecids, &change->node);
3476 : 24187 : txn->ntuplecids++;
3477 : 24187 : }
3478 : :
3479 : : /*
3480 : : * Add new invalidation messages to the reorder buffer queue.
3481 : : */
3482 : : static void
184 msawada@postgresql.o 3483 : 5189 : ReorderBufferQueueInvalidations(ReorderBuffer *rb, TransactionId xid,
3484 : : XLogRecPtr lsn, Size nmsgs,
3485 : : SharedInvalidationMessage *msgs)
3486 : : {
3487 : : ReorderBufferChange *change;
3488 : :
3489 : 5189 : change = ReorderBufferAllocChange(rb);
3490 : 5189 : change->action = REORDER_BUFFER_CHANGE_INVALIDATION;
3491 : 5189 : change->data.inval.ninvalidations = nmsgs;
7 michael@paquier.xyz 3492 :GNC 5189 : change->data.inval.invalidations = palloc_array(SharedInvalidationMessage, nmsgs);
184 msawada@postgresql.o 3493 :CBC 5189 : memcpy(change->data.inval.invalidations, msgs,
3494 : : sizeof(SharedInvalidationMessage) * nmsgs);
3495 : :
3496 : 5189 : ReorderBufferQueueChange(rb, xid, lsn, change, false);
3497 : 5189 : }
3498 : :
3499 : : /*
3500 : : * A helper function for ReorderBufferAddInvalidations() and
3501 : : * ReorderBufferAddDistributedInvalidations() to accumulate the invalidation
3502 : : * messages into *invals_out, updating *ninvals_out accordingly.
3503 : : */
3504 : : static void
3505 : 5189 : ReorderBufferAccumulateInvalidations(SharedInvalidationMessage **invals_out,
3506 : : uint32 *ninvals_out,
3507 : : SharedInvalidationMessage *msgs_new,
3508 : : Size nmsgs_new)
3509 : : {
3510 [ + + ]: 5189 : if (*ninvals_out == 0)
3511 : : {
3512 : 1274 : *ninvals_out = nmsgs_new;
7 michael@paquier.xyz 3513 :GNC 1274 : *invals_out = palloc_array(SharedInvalidationMessage, nmsgs_new);
184 msawada@postgresql.o 3514 :CBC 1274 : memcpy(*invals_out, msgs_new, sizeof(SharedInvalidationMessage) * nmsgs_new);
3515 : : }
3516 : : else
3517 : : {
3518 : : /* Enlarge the array of inval messages */
3519 : 3915 : *invals_out = (SharedInvalidationMessage *)
3520 : 3915 : repalloc(*invals_out, sizeof(SharedInvalidationMessage) *
3521 : 3915 : (*ninvals_out + nmsgs_new));
3522 : 3915 : memcpy(*invals_out + *ninvals_out, msgs_new,
3523 : : nmsgs_new * sizeof(SharedInvalidationMessage));
3524 : 3915 : *ninvals_out += nmsgs_new;
3525 : : }
3526 : 5189 : }
3527 : :
3528 : : /*
3529 : : * Accumulate the invalidations for executing them later.
3530 : : *
3531 : : * This needs to be called for each XLOG_XACT_INVALIDATIONS message and
3532 : : * accumulates all the invalidation messages in the toplevel transaction, if
3533 : : * available, otherwise in the current transaction, as well as in the form of
3534 : : * a change in the reorder buffer. We need to record them as a change
3535 : : * so that we can execute only the required invalidations instead of executing
3536 : : * all the invalidations on each CommandId increment. We also need to
3537 : : * accumulate these in the txn buffer because in some cases where we skip
3538 : : * processing the transaction (see ReorderBufferForget), we need to execute
3539 : : * all the invalidations together.
3540 : : */
3541 : : void
4307 rhaas@postgresql.org 3542 : 5160 : ReorderBufferAddInvalidations(ReorderBuffer *rb, TransactionId xid,
3543 : : XLogRecPtr lsn, Size nmsgs,
3544 : : SharedInvalidationMessage *msgs)
3545 : : {
3546 : : ReorderBufferTXN *txn;
3547 : : MemoryContext oldcontext;
3548 : :
3549 : 5160 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3550 : :
1889 akapila@postgresql.o 3551 : 5160 : oldcontext = MemoryContextSwitchTo(rb->context);
3552 : :
3553 : : /*
3554 : : * Collect all the invalidations under the top transaction, if available,
3555 : : * so that we can execute them all together. See comments atop this
3556 : : * function.
3557 : : */
1006 3558 [ + + ]: 5160 : txn = rbtxn_get_toptxn(txn);
3559 : :
4307 rhaas@postgresql.org 3560 [ - + ]: 5160 : Assert(nmsgs > 0);
3561 : :
184 msawada@postgresql.o 3562 : 5160 : ReorderBufferAccumulateInvalidations(&txn->invalidations,
3563 : : &txn->ninvalidations,
3564 : : msgs, nmsgs);
3565 : :
3566 : 5160 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3567 : :
3568 : 5160 : MemoryContextSwitchTo(oldcontext);
3569 : 5160 : }
3570 : :
3571 : : /*
3572 : : * Accumulate the invalidations distributed by other committed transactions
3573 : : * for executing them later.
3574 : : *
3575 : : * This function is similar to ReorderBufferAddInvalidations() but stores
3576 : : * the given inval messages in txn->invalidations_distributed, subject to
3577 : : * an overflow check.
3578 : : *
3579 : : * This needs to be called by committed transactions to distribute their
3580 : : * inval messages to in-progress transactions.
3581 : : */
3582 : : void
3583 : 29 : ReorderBufferAddDistributedInvalidations(ReorderBuffer *rb, TransactionId xid,
3584 : : XLogRecPtr lsn, Size nmsgs,
3585 : : SharedInvalidationMessage *msgs)
3586 : : {
3587 : : ReorderBufferTXN *txn;
3588 : : MemoryContext oldcontext;
3589 : :
3590 : 29 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3591 : :
3592 : 29 : oldcontext = MemoryContextSwitchTo(rb->context);
3593 : :
3594 : : /*
3595 : : * Collect all the invalidations under the top transaction, if available,
3596 : : * so that we can execute them all together. See the comments atop
3597 : : * ReorderBufferAddInvalidations().
3598 : : */
3599 [ - + ]: 29 : txn = rbtxn_get_toptxn(txn);
3600 : :
3601 [ - + ]: 29 : Assert(nmsgs > 0);
3602 : :
3603 [ + - ]: 29 : if (!rbtxn_distr_inval_overflowed(txn))
3604 : : {
3605 : : /*
3606 : : * Check the transaction has enough space for storing distributed
3607 : : * invalidation messages.
3608 : : */
3609 [ - + ]: 29 : if (txn->ninvalidations_distributed + nmsgs >= MAX_DISTR_INVAL_MSG_PER_TXN)
3610 : : {
3611 : : /*
3612 : : * Mark the transaction's distributed invalidations as overflowed
3613 : : * and free up the messages accumulated so far.
3614 : : */
184 msawada@postgresql.o 3615 :UBC 0 : txn->txn_flags |= RBTXN_DISTR_INVAL_OVERFLOWED;
3616 : :
3617 [ # # ]: 0 : if (txn->invalidations_distributed)
3618 : : {
3619 : 0 : pfree(txn->invalidations_distributed);
3620 : 0 : txn->invalidations_distributed = NULL;
3621 : 0 : txn->ninvalidations_distributed = 0;
3622 : : }
3623 : : }
3624 : : else
184 msawada@postgresql.o 3625 :CBC 29 : ReorderBufferAccumulateInvalidations(&txn->invalidations_distributed,
3626 : : &txn->ninvalidations_distributed,
3627 : : msgs, nmsgs);
3628 : : }
3629 : :
3630 : : /* Queue the invalidation messages into the transaction */
3631 : 29 : ReorderBufferQueueInvalidations(rb, xid, lsn, nmsgs, msgs);
3632 : :
1889 akapila@postgresql.o 3633 : 29 : MemoryContextSwitchTo(oldcontext);
4307 rhaas@postgresql.org 3634 : 29 : }
3635 : :
3636 : : /*
3637 : : * Apply all invalidations we know. Possibly we only need parts at this point
3638 : : * in the changestream but we don't know which those are.
3639 : : */
3640 : : static void
1889 akapila@postgresql.o 3641 : 6663 : ReorderBufferExecuteInvalidations(uint32 nmsgs, SharedInvalidationMessage *msgs)
3642 : : {
3643 : : int i;
3644 : :
3645 [ + + ]: 49327 : for (i = 0; i < nmsgs; i++)
3646 : 42664 : LocalExecuteInvalidationMessage(&msgs[i]);
4307 rhaas@postgresql.org 3647 : 6663 : }
3648 : :
3649 : : /*
3650 : : * Mark a transaction as containing catalog changes
3651 : : */
3652 : : void
3653 : 29381 : ReorderBufferXidSetCatalogChanges(ReorderBuffer *rb, TransactionId xid,
3654 : : XLogRecPtr lsn)
3655 : : {
3656 : : ReorderBufferTXN *txn;
3657 : :
3658 : 29381 : txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true);
3659 : :
1224 akapila@postgresql.o 3660 [ + + ]: 29381 : if (!rbtxn_has_catalog_changes(txn))
3661 : : {
3662 : 1283 : txn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
1141 drowley@postgresql.o 3663 : 1283 : dclist_push_tail(&rb->catchange_txns, &txn->catchange_node);
3664 : : }
3665 : :
3666 : : /*
3667 : : * Mark top-level transaction as having catalog changes too if one of its
3668 : : * children has so that the ReorderBufferBuildTupleCidHash can
3669 : : * conveniently check just top-level transaction and decide whether to
3670 : : * build the hash table or not.
3671 : : */
1006 akapila@postgresql.o 3672 [ + + ]: 29381 : if (rbtxn_is_subtxn(txn))
3673 : : {
3674 [ + - ]: 896 : ReorderBufferTXN *toptxn = rbtxn_get_toptxn(txn);
3675 : :
3676 [ + + ]: 896 : if (!rbtxn_has_catalog_changes(toptxn))
3677 : : {
3678 : 20 : toptxn->txn_flags |= RBTXN_HAS_CATALOG_CHANGES;
3679 : 20 : dclist_push_tail(&rb->catchange_txns, &toptxn->catchange_node);
3680 : : }
3681 : : }
1224 3682 : 29381 : }
3683 : :
3684 : : /*
3685 : : * Return palloc'ed array of the transactions that have changed catalogs.
3686 : : * The returned array is sorted in xidComparator order.
3687 : : *
3688 : : * The caller must free the returned array when done with it.
3689 : : */
3690 : : TransactionId *
3691 : 288 : ReorderBufferGetCatalogChangesXacts(ReorderBuffer *rb)
3692 : : {
3693 : : dlist_iter iter;
3694 : 288 : TransactionId *xids = NULL;
3695 : 288 : size_t xcnt = 0;
3696 : :
3697 : : /* Quick return if the list is empty */
1141 drowley@postgresql.o 3698 [ + + ]: 288 : if (dclist_count(&rb->catchange_txns) == 0)
1224 akapila@postgresql.o 3699 : 281 : return NULL;
3700 : :
3701 : : /* Initialize XID array */
7 michael@paquier.xyz 3702 :GNC 7 : xids = palloc_array(TransactionId, dclist_count(&rb->catchange_txns));
1141 drowley@postgresql.o 3703 [ + - + + ]:CBC 17 : dclist_foreach(iter, &rb->catchange_txns)
3704 : : {
3705 : 10 : ReorderBufferTXN *txn = dclist_container(ReorderBufferTXN,
3706 : : catchange_node,
3707 : : iter.cur);
3708 : :
1224 akapila@postgresql.o 3709 [ - + ]: 10 : Assert(rbtxn_has_catalog_changes(txn));
3710 : :
3711 : 10 : xids[xcnt++] = txn->xid;
3712 : : }
3713 : :
3714 : 7 : qsort(xids, xcnt, sizeof(TransactionId), xidComparator);
3715 : :
1141 drowley@postgresql.o 3716 [ - + ]: 7 : Assert(xcnt == dclist_count(&rb->catchange_txns));
1224 akapila@postgresql.o 3717 : 7 : return xids;
3718 : : }
3719 : :
3720 : : /*
3721 : : * Query whether a transaction is already *known* to contain catalog
3722 : : * changes. This can be wrong until directly before the commit!
3723 : : */
3724 : : bool
4307 rhaas@postgresql.org 3725 : 4344 : ReorderBufferXidHasCatalogChanges(ReorderBuffer *rb, TransactionId xid)
3726 : : {
3727 : : ReorderBufferTXN *txn;
3728 : :
3729 : 4344 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
3730 : : false);
3731 [ + + ]: 4344 : if (txn == NULL)
3732 : 658 : return false;
3733 : :
2168 alvherre@alvh.no-ip. 3734 : 3686 : return rbtxn_has_catalog_changes(txn);
3735 : : }
3736 : :
3737 : : /*
3738 : : * ReorderBufferXidHasBaseSnapshot
3739 : : * Have we already set the base snapshot for the given txn/subtxn?
3740 : : */
3741 : : bool
4307 rhaas@postgresql.org 3742 : 1403139 : ReorderBufferXidHasBaseSnapshot(ReorderBuffer *rb, TransactionId xid)
3743 : : {
3744 : : ReorderBufferTXN *txn;
3745 : :
2731 alvherre@alvh.no-ip. 3746 : 1403139 : txn = ReorderBufferTXNByXid(rb, xid, false,
3747 : : NULL, InvalidXLogRecPtr, false);
3748 : :
3749 : : /* transaction isn't known yet, ergo no snapshot */
4307 rhaas@postgresql.org 3750 [ + + ]: 1403139 : if (txn == NULL)
3751 : 3 : return false;
3752 : :
3753 : : /* a known subtxn? operate on top-level txn instead */
2168 alvherre@alvh.no-ip. 3754 [ + + ]: 1403136 : if (rbtxn_is_known_subxact(txn))
2731 3755 : 342026 : txn = ReorderBufferTXNByXid(rb, txn->toplevel_xid, false,
3756 : : NULL, InvalidXLogRecPtr, false);
3757 : :
4307 rhaas@postgresql.org 3758 : 1403136 : return txn->base_snapshot != NULL;
3759 : : }
3760 : :
3761 : :
3762 : : /*
3763 : : * ---------------------------------------
3764 : : * Disk serialization support
3765 : : * ---------------------------------------
3766 : : */
3767 : :
3768 : : /*
3769 : : * Ensure the IO buffer is >= sz.
3770 : : */
3771 : : static void
3772 : 2717486 : ReorderBufferSerializeReserve(ReorderBuffer *rb, Size sz)
3773 : : {
3774 [ + + ]: 2717486 : if (!rb->outbufsize)
3775 : : {
3776 : 47 : rb->outbuf = MemoryContextAlloc(rb->context, sz);
3777 : 47 : rb->outbufsize = sz;
3778 : : }
3779 [ + + ]: 2717439 : else if (rb->outbufsize < sz)
3780 : : {
3781 : 266 : rb->outbuf = repalloc(rb->outbuf, sz);
3782 : 266 : rb->outbufsize = sz;
3783 : : }
3784 : 2717486 : }
3785 : :
3786 : :
3787 : : /* Compare two transactions by size */
3788 : : static int
615 msawada@postgresql.o 3789 : 344055 : ReorderBufferTXNSizeCompare(const pairingheap_node *a, const pairingheap_node *b, void *arg)
3790 : : {
3791 : 344055 : const ReorderBufferTXN *ta = pairingheap_const_container(ReorderBufferTXN, txn_node, a);
3792 : 344055 : const ReorderBufferTXN *tb = pairingheap_const_container(ReorderBufferTXN, txn_node, b);
3793 : :
623 3794 [ + + ]: 344055 : if (ta->size < tb->size)
3795 : 245372 : return -1;
3796 [ + + ]: 98683 : if (ta->size > tb->size)
3797 : 97815 : return 1;
3798 : 868 : return 0;
3799 : : }
3800 : :
3801 : : /*
3802 : : * Find the largest transaction (toplevel or subxact) to evict (spill to disk).
3803 : : */
3804 : : static ReorderBufferTXN *
3805 : 3468 : ReorderBufferLargestTXN(ReorderBuffer *rb)
3806 : : {
3807 : : ReorderBufferTXN *largest;
3808 : :
3809 : : /* Get the largest transaction from the max-heap */
615 3810 : 3468 : largest = pairingheap_container(ReorderBufferTXN, txn_node,
3811 : : pairingheap_first(rb->txn_heap));
3812 : :
2223 akapila@postgresql.o 3813 [ - + ]: 3468 : Assert(largest);
3814 [ - + ]: 3468 : Assert(largest->size > 0);
3815 [ - + ]: 3468 : Assert(largest->size <= rb->size);
3816 : :
3817 : 3468 : return largest;
3818 : : }
3819 : :
3820 : : /*
3821 : : * Find the largest streamable (and non-aborted) toplevel transaction to evict
3822 : : * (by streaming).
3823 : : *
3824 : : * This can be seen as an optimized version of ReorderBufferLargestTXN, which
3825 : : * should give us the same transaction (because we don't update memory account
3826 : : * for subtransaction with streaming, so it's always 0). But we can simply
3827 : : * iterate over the limited number of toplevel transactions that have a base
3828 : : * snapshot. There is no use of selecting a transaction that doesn't have base
3829 : : * snapshot because we don't decode such transactions. Also, we do not select
3830 : : * the transaction which doesn't have any streamable change.
3831 : : *
3832 : : * Note that, we skip transactions that contain incomplete changes. There
3833 : : * is a scope of optimization here such that we can select the largest
3834 : : * transaction which has incomplete changes. But that will make the code and
3835 : : * design quite complex and that might not be worth the benefit. If we plan to
3836 : : * stream the transactions that contain incomplete changes then we need to
3837 : : * find a way to partially stream/truncate the transaction changes in-memory
3838 : : * and build a mechanism to partially truncate the spilled files.
3839 : : * Additionally, whenever we partially stream the transaction we need to
3840 : : * maintain the last streamed lsn and next time we need to restore from that
3841 : : * segment and the offset in WAL. As we stream the changes from the top
3842 : : * transaction and restore them subtransaction wise, we need to even remember
3843 : : * the subxact from where we streamed the last change.
3844 : : */
3845 : : static ReorderBufferTXN *
1105 3846 : 798 : ReorderBufferLargestStreamableTopTXN(ReorderBuffer *rb)
3847 : : {
3848 : : dlist_iter iter;
1957 3849 : 798 : Size largest_size = 0;
3850 : 798 : ReorderBufferTXN *largest = NULL;
3851 : :
3852 : : /* Find the largest top-level transaction having a base snapshot. */
1692 3853 [ + - + + ]: 1708 : dlist_foreach(iter, &rb->txns_by_base_snapshot_lsn)
3854 : : {
3855 : : ReorderBufferTXN *txn;
3856 : :
3857 : 910 : txn = dlist_container(ReorderBufferTXN, base_snapshot_node, iter.cur);
3858 : :
3859 : : /* must not be a subtxn */
3860 [ - + ]: 910 : Assert(!rbtxn_is_known_subxact(txn));
3861 : : /* base_snapshot must be set */
3862 [ - + ]: 910 : Assert(txn->base_snapshot != NULL);
3863 : :
3864 : : /* Don't consider these kinds of transactions for eviction. */
308 msawada@postgresql.o 3865 [ + + ]: 910 : if (rbtxn_has_partial_change(txn) ||
3866 [ + + ]: 763 : !rbtxn_has_streamable_change(txn) ||
3867 [ - + ]: 733 : rbtxn_is_aborted(txn))
3868 : 177 : continue;
3869 : :
3870 : : /* Find the largest of the eviction candidates. */
1692 akapila@postgresql.o 3871 [ + + + - ]: 733 : if ((largest == NULL || txn->total_size > largest_size) &&
308 msawada@postgresql.o 3872 [ + + ]: 733 : (txn->total_size > 0))
3873 : : {
1957 akapila@postgresql.o 3874 : 687 : largest = txn;
3875 : 687 : largest_size = txn->total_size;
3876 : : }
3877 : : }
3878 : :
3879 : 798 : return largest;
3880 : : }
3881 : :
3882 : : /*
3883 : : * Check whether the logical_decoding_work_mem limit was reached, and if yes
3884 : : * pick the largest (sub)transaction at-a-time to evict and spill its changes to
3885 : : * disk or send to the output plugin until we reach under the memory limit.
3886 : : *
3887 : : * If debug_logical_replication_streaming is set to "immediate", stream or
3888 : : * serialize the changes immediately.
3889 : : *
3890 : : * XXX At this point we select the transactions until we reach under the memory
3891 : : * limit, but we might also adapt a more elaborate eviction strategy - for example
3892 : : * evicting enough transactions to free certain fraction (e.g. 50%) of the memory
3893 : : * limit.
3894 : : */
3895 : : static void
2223 3896 : 1412739 : ReorderBufferCheckMemoryLimit(ReorderBuffer *rb)
3897 : : {
3898 : : ReorderBufferTXN *txn;
70 msawada@postgresql.o 3899 :GNC 1412739 : bool update_stats = true;
3900 : :
3901 [ + + ]: 1412739 : if (rb->size >= logical_decoding_work_mem * (Size) 1024)
3902 : : {
3903 : : /*
3904 : : * Update the statistics as the memory usage has reached the limit. We
3905 : : * report the statistics update later in this function since we can
3906 : : * update the slot statistics altogether while streaming or
3907 : : * serializing transactions in most cases.
3908 : : */
3909 : 3147 : rb->memExceededCount += 1;
3910 : : }
3911 [ + + ]: 1409592 : else if (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_BUFFERED)
3912 : : {
3913 : : /*
3914 : : * Bail out if debug_logical_replication_streaming is buffered and we
3915 : : * haven't exceeded the memory limit.
3916 : : */
2223 akapila@postgresql.o 3917 :CBC 1408647 : return;
3918 : : }
3919 : :
3920 : : /*
3921 : : * If debug_logical_replication_streaming is immediate, loop until there's
3922 : : * no change. Otherwise, loop until we reach under the memory limit. One
3923 : : * might think that just by evicting the largest (sub)transaction we will
3924 : : * come under the memory limit based on assumption that the selected
3925 : : * transaction is at least as large as the most recent change (which
3926 : : * caused us to go over the memory limit). However, that is not true
3927 : : * because a user can reduce the logical_decoding_work_mem to a smaller
3928 : : * value before the most recent change.
3929 : : */
320 tgl@sss.pgh.pa.us 3930 [ + + ]: 8181 : while (rb->size >= logical_decoding_work_mem * (Size) 1024 ||
841 peter@eisentraut.org 3931 [ + + ]: 5034 : (debug_logical_replication_streaming == DEBUG_LOGICAL_REP_STREAMING_IMMEDIATE &&
1087 akapila@postgresql.o 3932 [ + + ]: 1887 : rb->size > 0))
3933 : : {
3934 : : /*
3935 : : * Pick the largest non-aborted transaction and evict it from memory
3936 : : * by streaming, if possible. Otherwise, spill to disk.
3937 : : */
1957 3938 [ + + + + ]: 4887 : if (ReorderBufferCanStartStreaming(rb) &&
1105 3939 : 798 : (txn = ReorderBufferLargestStreamableTopTXN(rb)) != NULL)
3940 : : {
3941 : : /* we know there has to be one, because the size is not zero */
1006 3942 [ + - - + ]: 621 : Assert(txn && rbtxn_is_toptxn(txn));
1957 3943 [ - + ]: 621 : Assert(txn->total_size > 0);
3944 [ - + ]: 621 : Assert(rb->size >= txn->total_size);
3945 : :
3946 : : /* skip the transaction if aborted */
308 msawada@postgresql.o 3947 [ - + ]: 621 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
308 msawada@postgresql.o 3948 :UBC 0 : continue;
3949 : :
1957 akapila@postgresql.o 3950 :CBC 621 : ReorderBufferStreamTXN(rb, txn);
3951 : : }
3952 : : else
3953 : : {
3954 : : /*
3955 : : * Pick the largest transaction (or subtransaction) and evict it
3956 : : * from memory by serializing it to disk.
3957 : : */
3958 : 3468 : txn = ReorderBufferLargestTXN(rb);
3959 : :
3960 : : /* we know there has to be one, because the size is not zero */
3961 [ - + ]: 3468 : Assert(txn);
3962 [ - + ]: 3468 : Assert(txn->size > 0);
3963 [ - + ]: 3468 : Assert(rb->size >= txn->size);
3964 : :
3965 : : /* skip the transaction if aborted */
308 msawada@postgresql.o 3966 [ + + ]: 3468 : if (ReorderBufferCheckAndTruncateAbortedTXN(rb, txn))
3967 : 9 : continue;
3968 : :
1957 akapila@postgresql.o 3969 : 3459 : ReorderBufferSerializeTXN(rb, txn);
3970 : : }
3971 : :
3972 : : /*
3973 : : * After eviction, the transaction should have no entries in memory,
3974 : : * and should use 0 bytes for changes.
3975 : : */
2016 3976 [ - + ]: 4080 : Assert(txn->size == 0);
3977 [ - + ]: 4080 : Assert(txn->nentries_mem == 0);
3978 : :
3979 : : /*
3980 : : * We've reported the memExceededCount update while streaming or
3981 : : * serializing the transaction.
3982 : : */
70 msawada@postgresql.o 3983 :GNC 4080 : update_stats = false;
3984 : : }
3985 : :
3986 [ + + ]: 4092 : if (update_stats)
3987 : 12 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
3988 : :
3989 : : /* We must be under the memory limit now. */
320 tgl@sss.pgh.pa.us 3990 [ - + ]:CBC 4092 : Assert(rb->size < logical_decoding_work_mem * (Size) 1024);
3991 : : }
3992 : :
3993 : : /*
3994 : : * Spill data of a large transaction (and its subtransactions) to disk.
3995 : : */
3996 : : static void
4307 rhaas@postgresql.org 3997 : 3624 : ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
3998 : : {
3999 : : dlist_iter subtxn_i;
4000 : : dlist_mutable_iter change_i;
4001 : 3624 : int fd = -1;
4002 : 3624 : XLogSegNo curOpenSegNo = 0;
4003 : 3624 : Size spilled = 0;
1896 akapila@postgresql.o 4004 : 3624 : Size size = txn->size;
4005 : :
4249 tgl@sss.pgh.pa.us 4006 [ - + ]: 3624 : elog(DEBUG2, "spill %u changes in XID %u to disk",
4007 : : (uint32) txn->nentries_mem, txn->xid);
4008 : :
4009 : : /* do the same to all child TXs */
4307 rhaas@postgresql.org 4010 [ + - + + ]: 3749 : dlist_foreach(subtxn_i, &txn->subtxns)
4011 : : {
4012 : : ReorderBufferTXN *subtxn;
4013 : :
4014 : 125 : subtxn = dlist_container(ReorderBufferTXN, node, subtxn_i.cur);
4015 : 125 : ReorderBufferSerializeTXN(rb, subtxn);
4016 : : }
4017 : :
4018 : : /* serialize changestream */
4019 [ + - + + ]: 1192607 : dlist_foreach_modify(change_i, &txn->changes)
4020 : : {
4021 : : ReorderBufferChange *change;
4022 : :
4023 : 1188983 : change = dlist_container(ReorderBufferChange, node, change_i.cur);
4024 : :
4025 : : /*
4026 : : * store in segment in which it belongs by start lsn, don't split over
4027 : : * multiple segments tho
4028 : : */
3011 andres@anarazel.de 4029 [ + + ]: 1188983 : if (fd == -1 ||
4030 [ + + ]: 1185479 : !XLByteInSeg(change->lsn, curOpenSegNo, wal_segment_size))
4031 : : {
4032 : : char path[MAXPGPATH];
4033 : :
4307 rhaas@postgresql.org 4034 [ + + ]: 3524 : if (fd != -1)
4035 : 20 : CloseTransientFile(fd);
4036 : :
3011 andres@anarazel.de 4037 : 3524 : XLByteToSeg(change->lsn, curOpenSegNo, wal_segment_size);
4038 : :
4039 : : /*
4040 : : * No need to care about TLIs here, only used during a single run,
4041 : : * so each LSN only maps to a specific WAL record.
4042 : : */
2843 alvherre@alvh.no-ip. 4043 : 3524 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4044 : : curOpenSegNo);
4045 : :
4046 : : /* open segment, create it if necessary */
4307 rhaas@postgresql.org 4047 : 3524 : fd = OpenTransientFile(path,
4048 : : O_CREAT | O_WRONLY | O_APPEND | PG_BINARY);
4049 : :
4050 [ - + ]: 3524 : if (fd < 0)
4307 rhaas@postgresql.org 4051 [ # # ]:UBC 0 : ereport(ERROR,
4052 : : (errcode_for_file_access(),
4053 : : errmsg("could not open file \"%s\": %m", path)));
4054 : : }
4055 : :
4307 rhaas@postgresql.org 4056 :CBC 1188983 : ReorderBufferSerializeChange(rb, txn, fd, change);
4057 : 1188983 : dlist_delete(&change->node);
280 heikki.linnakangas@i 4058 : 1188983 : ReorderBufferFreeChange(rb, change, false);
4059 : :
4307 rhaas@postgresql.org 4060 : 1188983 : spilled++;
4061 : : }
4062 : :
4063 : : /* Update the memory counter */
623 msawada@postgresql.o 4064 : 3624 : ReorderBufferChangeMemoryUpdate(rb, NULL, txn, false, size);
4065 : :
4066 : : /* update the statistics iff we have spilled anything */
1896 akapila@postgresql.o 4067 [ + + ]: 3624 : if (spilled)
4068 : : {
4069 : 3504 : rb->spillCount += 1;
4070 : 3504 : rb->spillBytes += size;
4071 : :
4072 : : /* don't consider already serialized transactions */
4073 [ + + + - ]: 3504 : rb->spillTxns += (rbtxn_is_serialized(txn) || rbtxn_is_serialized_clear(txn)) ? 0 : 1;
4074 : :
4075 : : /* update the decoding stats */
1686 4076 : 3504 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4077 : : }
4078 : :
4307 rhaas@postgresql.org 4079 [ - + ]: 3624 : Assert(spilled == txn->nentries_mem);
4080 [ - + ]: 3624 : Assert(dlist_is_empty(&txn->changes));
4081 : 3624 : txn->nentries_mem = 0;
2168 alvherre@alvh.no-ip. 4082 : 3624 : txn->txn_flags |= RBTXN_IS_SERIALIZED;
4083 : :
4307 rhaas@postgresql.org 4084 [ + + ]: 3624 : if (fd != -1)
4085 : 3504 : CloseTransientFile(fd);
4086 : 3624 : }
4087 : :
4088 : : /*
4089 : : * Serialize individual change to disk.
4090 : : */
4091 : : static void
4092 : 1188983 : ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4093 : : int fd, ReorderBufferChange *change)
4094 : : {
4095 : : ReorderBufferDiskChange *ondisk;
4096 : 1188983 : Size sz = sizeof(ReorderBufferDiskChange);
4097 : :
4098 : 1188983 : ReorderBufferSerializeReserve(rb, sz);
4099 : :
4100 : 1188983 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4101 : 1188983 : memcpy(&ondisk->change, change, sizeof(ReorderBufferChange));
4102 : :
4303 tgl@sss.pgh.pa.us 4103 [ + + + + : 1188983 : switch (change->action)
+ + - ]
4104 : : {
4105 : : /* fall through these, they're all similar enough */
4106 : 1171495 : case REORDER_BUFFER_CHANGE_INSERT:
4107 : : case REORDER_BUFFER_CHANGE_UPDATE:
4108 : : case REORDER_BUFFER_CHANGE_DELETE:
4109 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4110 : : {
4111 : : char *data;
4112 : : HeapTuple oldtup,
4113 : : newtup;
4307 rhaas@postgresql.org 4114 : 1171495 : Size oldlen = 0;
4115 : 1171495 : Size newlen = 0;
4116 : :
4303 tgl@sss.pgh.pa.us 4117 : 1171495 : oldtup = change->data.tp.oldtuple;
4118 : 1171495 : newtup = change->data.tp.newtuple;
4119 : :
4120 [ + + ]: 1171495 : if (oldtup)
4121 : : {
3574 andres@anarazel.de 4122 : 160127 : sz += sizeof(HeapTupleData);
688 msawada@postgresql.o 4123 : 160127 : oldlen = oldtup->t_len;
3574 andres@anarazel.de 4124 : 160127 : sz += oldlen;
4125 : : }
4126 : :
4303 tgl@sss.pgh.pa.us 4127 [ + + ]: 1171495 : if (newtup)
4128 : : {
3574 andres@anarazel.de 4129 : 957653 : sz += sizeof(HeapTupleData);
688 msawada@postgresql.o 4130 : 957653 : newlen = newtup->t_len;
3574 andres@anarazel.de 4131 : 957653 : sz += newlen;
4132 : : }
4133 : :
4134 : : /* make sure we have enough space */
4307 rhaas@postgresql.org 4135 : 1171495 : ReorderBufferSerializeReserve(rb, sz);
4136 : :
4137 : 1171495 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4138 : : /* might have been reallocated above */
4139 : 1171495 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4140 : :
4141 [ + + ]: 1171495 : if (oldlen)
4142 : : {
688 msawada@postgresql.o 4143 : 160127 : memcpy(data, oldtup, sizeof(HeapTupleData));
3574 andres@anarazel.de 4144 : 160127 : data += sizeof(HeapTupleData);
4145 : :
688 msawada@postgresql.o 4146 : 160127 : memcpy(data, oldtup->t_data, oldlen);
4307 rhaas@postgresql.org 4147 : 160127 : data += oldlen;
4148 : : }
4149 : :
4150 [ + + ]: 1171495 : if (newlen)
4151 : : {
688 msawada@postgresql.o 4152 : 957653 : memcpy(data, newtup, sizeof(HeapTupleData));
3574 andres@anarazel.de 4153 : 957653 : data += sizeof(HeapTupleData);
4154 : :
688 msawada@postgresql.o 4155 : 957653 : memcpy(data, newtup->t_data, newlen);
3572 andres@anarazel.de 4156 : 957653 : data += newlen;
4157 : : }
3542 simon@2ndQuadrant.co 4158 : 1171495 : break;
4159 : : }
4160 : 13 : case REORDER_BUFFER_CHANGE_MESSAGE:
4161 : : {
4162 : : char *data;
4163 : 13 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4164 : :
4165 : 13 : sz += prefix_size + change->data.msg.message_size +
4166 : : sizeof(Size) + sizeof(Size);
4167 : 13 : ReorderBufferSerializeReserve(rb, sz);
4168 : :
4169 : 13 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4170 : :
4171 : : /* might have been reallocated above */
3367 rhaas@postgresql.org 4172 : 13 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4173 : :
4174 : : /* write the prefix including the size */
3542 simon@2ndQuadrant.co 4175 : 13 : memcpy(data, &prefix_size, sizeof(Size));
4176 : 13 : data += sizeof(Size);
4177 : 13 : memcpy(data, change->data.msg.prefix,
4178 : : prefix_size);
4179 : 13 : data += prefix_size;
4180 : :
4181 : : /* write the message including the size */
4182 : 13 : memcpy(data, &change->data.msg.message_size, sizeof(Size));
4183 : 13 : data += sizeof(Size);
4184 : 13 : memcpy(data, change->data.msg.message,
4185 : : change->data.msg.message_size);
4186 : 13 : data += change->data.msg.message_size;
4187 : :
1889 akapila@postgresql.o 4188 : 13 : break;
4189 : : }
4190 : 154 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4191 : : {
4192 : : char *data;
4193 : 154 : Size inval_size = sizeof(SharedInvalidationMessage) *
943 tgl@sss.pgh.pa.us 4194 : 154 : change->data.inval.ninvalidations;
4195 : :
1889 akapila@postgresql.o 4196 : 154 : sz += inval_size;
4197 : :
4198 : 154 : ReorderBufferSerializeReserve(rb, sz);
4199 : 154 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4200 : :
4201 : : /* might have been reallocated above */
4202 : 154 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4203 : 154 : memcpy(data, change->data.inval.invalidations, inval_size);
4204 : 154 : data += inval_size;
4205 : :
4307 rhaas@postgresql.org 4206 : 154 : break;
4207 : : }
4208 : 8 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4209 : : {
4210 : : Snapshot snap;
4211 : : char *data;
4212 : :
4303 tgl@sss.pgh.pa.us 4213 : 8 : snap = change->data.snapshot;
4214 : :
4307 rhaas@postgresql.org 4215 : 8 : sz += sizeof(SnapshotData) +
4303 tgl@sss.pgh.pa.us 4216 : 8 : sizeof(TransactionId) * snap->xcnt +
2161 alvherre@alvh.no-ip. 4217 : 8 : sizeof(TransactionId) * snap->subxcnt;
4218 : :
4219 : : /* make sure we have enough space */
4307 rhaas@postgresql.org 4220 : 8 : ReorderBufferSerializeReserve(rb, sz);
4221 : 8 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4222 : : /* might have been reallocated above */
4223 : 8 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4224 : :
4303 tgl@sss.pgh.pa.us 4225 : 8 : memcpy(data, snap, sizeof(SnapshotData));
4307 rhaas@postgresql.org 4226 : 8 : data += sizeof(SnapshotData);
4227 : :
4303 tgl@sss.pgh.pa.us 4228 [ + - ]: 8 : if (snap->xcnt)
4229 : : {
4230 : 8 : memcpy(data, snap->xip,
4240 rhaas@postgresql.org 4231 : 8 : sizeof(TransactionId) * snap->xcnt);
4232 : 8 : data += sizeof(TransactionId) * snap->xcnt;
4233 : : }
4234 : :
4303 tgl@sss.pgh.pa.us 4235 [ - + ]: 8 : if (snap->subxcnt)
4236 : : {
4303 tgl@sss.pgh.pa.us 4237 :UBC 0 : memcpy(data, snap->subxip,
4240 rhaas@postgresql.org 4238 : 0 : sizeof(TransactionId) * snap->subxcnt);
4239 : 0 : data += sizeof(TransactionId) * snap->subxcnt;
4240 : : }
4307 rhaas@postgresql.org 4241 :CBC 8 : break;
4242 : : }
2811 peter_e@gmx.net 4243 : 2 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4244 : : {
4245 : : Size size;
4246 : : char *data;
4247 : :
4248 : : /* account for the OIDs of truncated relations */
2662 tomas.vondra@postgre 4249 : 2 : size = sizeof(Oid) * change->data.truncate.nrelids;
4250 : 2 : sz += size;
4251 : :
4252 : : /* make sure we have enough space */
4253 : 2 : ReorderBufferSerializeReserve(rb, sz);
4254 : :
4255 : 2 : data = ((char *) rb->outbuf) + sizeof(ReorderBufferDiskChange);
4256 : : /* might have been reallocated above */
4257 : 2 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4258 : :
4259 : 2 : memcpy(data, change->data.truncate.relids, size);
4260 : 2 : data += size;
4261 : :
4262 : 2 : break;
4263 : : }
3876 andres@anarazel.de 4264 : 17311 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4265 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4266 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4267 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4268 : : /* ReorderBufferChange contains everything important */
4307 rhaas@postgresql.org 4269 : 17311 : break;
4270 : : }
4271 : :
4272 : 1188983 : ondisk->size = sz;
4273 : :
2691 michael@paquier.xyz 4274 : 1188983 : errno = 0;
3196 rhaas@postgresql.org 4275 : 1188983 : pgstat_report_wait_start(WAIT_EVENT_REORDER_BUFFER_WRITE);
4307 4276 [ - + ]: 1188983 : if (write(fd, rb->outbuf, ondisk->size) != ondisk->size)
4277 : : {
3399 tgl@sss.pgh.pa.us 4278 :UBC 0 : int save_errno = errno;
4279 : :
4307 rhaas@postgresql.org 4280 : 0 : CloseTransientFile(fd);
4281 : :
4282 : : /* if write didn't set errno, assume problem is no disk space */
2732 michael@paquier.xyz 4283 [ # # ]: 0 : errno = save_errno ? save_errno : ENOSPC;
4307 rhaas@postgresql.org 4284 [ # # ]: 0 : ereport(ERROR,
4285 : : (errcode_for_file_access(),
4286 : : errmsg("could not write to data file for XID %u: %m",
4287 : : txn->xid)));
4288 : : }
3196 rhaas@postgresql.org 4289 :CBC 1188983 : pgstat_report_wait_end();
4290 : :
4291 : : /*
4292 : : * Keep the transaction's final_lsn up to date with each change we send to
4293 : : * disk, so that ReorderBufferRestoreCleanup works correctly. (We used to
4294 : : * only do this on commit and abort records, but that doesn't work if a
4295 : : * system crash leaves a transaction without its abort record).
4296 : : *
4297 : : * Make sure not to move it backwards.
4298 : : */
2161 alvherre@alvh.no-ip. 4299 [ + + ]: 1188983 : if (txn->final_lsn < change->lsn)
4300 : 1184500 : txn->final_lsn = change->lsn;
4301 : :
4303 tgl@sss.pgh.pa.us 4302 [ - + ]: 1188983 : Assert(ondisk->change.action == change->action);
4307 rhaas@postgresql.org 4303 : 1188983 : }
4304 : :
4305 : : /* Returns true, if the output plugin supports streaming, false, otherwise. */
4306 : : static inline bool
1957 akapila@postgresql.o 4307 : 1920112 : ReorderBufferCanStream(ReorderBuffer *rb)
4308 : : {
4309 : 1920112 : LogicalDecodingContext *ctx = rb->private_data;
4310 : :
4311 : 1920112 : return ctx->streaming;
4312 : : }
4313 : :
4314 : : /* Returns true, if the streaming can be started now, false, otherwise. */
4315 : : static inline bool
4316 : 507373 : ReorderBufferCanStartStreaming(ReorderBuffer *rb)
4317 : : {
4318 : 507373 : LogicalDecodingContext *ctx = rb->private_data;
4319 : 507373 : SnapBuild *builder = ctx->snapshot_builder;
4320 : :
4321 : : /* We can't start streaming unless a consistent state is reached. */
1839 4322 [ - + ]: 507373 : if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT)
1839 akapila@postgresql.o 4323 :UBC 0 : return false;
4324 : :
4325 : : /*
4326 : : * We can't start streaming immediately even if the streaming is enabled
4327 : : * because we previously decoded this transaction and now just are
4328 : : * restarting.
4329 : : */
1957 akapila@postgresql.o 4330 [ + + ]:CBC 507373 : if (ReorderBufferCanStream(rb) &&
1105 4331 [ + + ]: 505445 : !SnapBuildXactNeedsSkip(builder, ctx->reader->ReadRecPtr))
1957 4332 : 170410 : return true;
4333 : :
4334 : 336963 : return false;
4335 : : }
4336 : :
4337 : : /*
4338 : : * Send data of a large transaction (and its subtransactions) to the
4339 : : * output plugin, but using the stream API.
4340 : : */
4341 : : static void
4342 : 692 : ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn)
4343 : : {
4344 : : Snapshot snapshot_now;
4345 : : CommandId command_id;
4346 : : Size stream_bytes;
4347 : : bool txn_is_streamed;
4348 : :
4349 : : /* We can never reach here for a subtransaction. */
1006 4350 [ - + ]: 692 : Assert(rbtxn_is_toptxn(txn));
4351 : :
4352 : : /*
4353 : : * We can't make any assumptions about base snapshot here, similar to what
4354 : : * ReorderBufferCommit() does. That relies on base_snapshot getting
4355 : : * transferred from subxact in ReorderBufferCommitChild(), but that was
4356 : : * not yet called as the transaction is in-progress.
4357 : : *
4358 : : * So just walk the subxacts and use the same logic here. But we only need
4359 : : * to do that once, when the transaction is streamed for the first time.
4360 : : * After that we need to reuse the snapshot from the previous run.
4361 : : *
4362 : : * Unlike DecodeCommit which adds xids of all the subtransactions in
4363 : : * snapshot's xip array via SnapBuildCommitTxn, we can't do that here but
4364 : : * we do add them to subxip array instead via ReorderBufferCopySnap. This
4365 : : * allows the catalog changes made in subtransactions decoded till now to
4366 : : * be visible.
4367 : : */
1957 4368 [ + + ]: 692 : if (txn->snapshot_now == NULL)
4369 : : {
4370 : : dlist_iter subxact_i;
4371 : :
4372 : : /* make sure this transaction is streamed for the first time */
4373 [ - + ]: 68 : Assert(!rbtxn_is_streamed(txn));
4374 : :
4375 : : /* at the beginning we should have invalid command ID */
4376 [ - + ]: 68 : Assert(txn->command_id == InvalidCommandId);
4377 : :
4378 [ + - + + ]: 72 : dlist_foreach(subxact_i, &txn->subtxns)
4379 : : {
4380 : : ReorderBufferTXN *subtxn;
4381 : :
4382 : 4 : subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur);
4383 : 4 : ReorderBufferTransferSnapToParent(txn, subtxn);
4384 : : }
4385 : :
4386 : : /*
4387 : : * If this transaction has no snapshot, it didn't make any changes to
4388 : : * the database till now, so there's nothing to decode.
4389 : : */
4390 [ - + ]: 68 : if (txn->base_snapshot == NULL)
4391 : : {
1957 akapila@postgresql.o 4392 [ # # ]:UBC 0 : Assert(txn->ninvalidations == 0);
4393 : 0 : return;
4394 : : }
4395 : :
1957 akapila@postgresql.o 4396 :CBC 68 : command_id = FirstCommandId;
4397 : 68 : snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot,
4398 : : txn, command_id);
4399 : : }
4400 : : else
4401 : : {
4402 : : /* the transaction must have been already streamed */
4403 [ - + ]: 624 : Assert(rbtxn_is_streamed(txn));
4404 : :
4405 : : /*
4406 : : * Nah, we already have snapshot from the previous streaming run. We
4407 : : * assume new subxacts can't move the LSN backwards, and so can't beat
4408 : : * the LSN condition in the previous branch (so no need to walk
4409 : : * through subxacts again). In fact, we must not do that as we may be
4410 : : * using snapshot half-way through the subxact.
4411 : : */
4412 : 624 : command_id = txn->command_id;
4413 : :
4414 : : /*
4415 : : * We can't use txn->snapshot_now directly because after the last
4416 : : * streaming run, we might have got some new sub-transactions. So we
4417 : : * need to add them to the snapshot.
4418 : : */
4419 : 624 : snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now,
4420 : : txn, command_id);
4421 : :
4422 : : /* Free the previously copied snapshot. */
4423 [ - + ]: 624 : Assert(txn->snapshot_now->copied);
4424 : 624 : ReorderBufferFreeSnap(rb, txn->snapshot_now);
4425 : 624 : txn->snapshot_now = NULL;
4426 : : }
4427 : :
4428 : : /*
4429 : : * Remember this information to be used later to update stats. We can't
4430 : : * update the stats here as an error while processing the changes would
4431 : : * lead to the accumulation of stats even though we haven't streamed all
4432 : : * the changes.
4433 : : */
1875 4434 : 692 : txn_is_streamed = rbtxn_is_streamed(txn);
4435 : 692 : stream_bytes = txn->total_size;
4436 : :
4437 : : /* Process and send the changes to output plugin. */
1957 4438 : 692 : ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now,
4439 : : command_id, true);
4440 : :
1875 4441 : 692 : rb->streamCount += 1;
4442 : 692 : rb->streamBytes += stream_bytes;
4443 : :
4444 : : /* Don't consider already streamed transaction. */
4445 : 692 : rb->streamTxns += (txn_is_streamed) ? 0 : 1;
4446 : :
4447 : : /* update the decoding stats */
1686 4448 : 692 : UpdateDecodingStats((LogicalDecodingContext *) rb->private_data);
4449 : :
1957 4450 [ - + ]: 692 : Assert(dlist_is_empty(&txn->changes));
4451 [ - + ]: 692 : Assert(txn->nentries == 0);
4452 [ - + ]: 692 : Assert(txn->nentries_mem == 0);
4453 : : }
4454 : :
4455 : : /*
4456 : : * Size of a change in memory.
4457 : : */
4458 : : static Size
2223 4459 : 2017723 : ReorderBufferChangeSize(ReorderBufferChange *change)
4460 : : {
4461 : 2017723 : Size sz = sizeof(ReorderBufferChange);
4462 : :
4463 [ + + + + : 2017723 : switch (change->action)
+ + - ]
4464 : : {
4465 : : /* fall through these, they're all similar enough */
4466 : 1910306 : case REORDER_BUFFER_CHANGE_INSERT:
4467 : : case REORDER_BUFFER_CHANGE_UPDATE:
4468 : : case REORDER_BUFFER_CHANGE_DELETE:
4469 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
4470 : : {
4471 : : HeapTuple oldtup,
4472 : : newtup;
4473 : 1910306 : Size oldlen = 0;
4474 : 1910306 : Size newlen = 0;
4475 : :
4476 : 1910306 : oldtup = change->data.tp.oldtuple;
4477 : 1910306 : newtup = change->data.tp.newtuple;
4478 : :
4479 [ + + ]: 1910306 : if (oldtup)
4480 : : {
4481 : 262186 : sz += sizeof(HeapTupleData);
688 msawada@postgresql.o 4482 : 262186 : oldlen = oldtup->t_len;
2223 akapila@postgresql.o 4483 : 262186 : sz += oldlen;
4484 : : }
4485 : :
4486 [ + + ]: 1910306 : if (newtup)
4487 : : {
4488 : 1565301 : sz += sizeof(HeapTupleData);
688 msawada@postgresql.o 4489 : 1565301 : newlen = newtup->t_len;
2223 akapila@postgresql.o 4490 : 1565301 : sz += newlen;
4491 : : }
4492 : :
4493 : 1910306 : break;
4494 : : }
4495 : 67 : case REORDER_BUFFER_CHANGE_MESSAGE:
4496 : : {
4497 : 67 : Size prefix_size = strlen(change->data.msg.prefix) + 1;
4498 : :
4499 : 67 : sz += prefix_size + change->data.msg.message_size +
4500 : : sizeof(Size) + sizeof(Size);
4501 : :
4502 : 67 : break;
4503 : : }
1889 4504 : 10163 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4505 : : {
4506 : 10163 : sz += sizeof(SharedInvalidationMessage) *
4507 : 10163 : change->data.inval.ninvalidations;
4508 : 10163 : break;
4509 : : }
2223 4510 : 2509 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4511 : : {
4512 : : Snapshot snap;
4513 : :
4514 : 2509 : snap = change->data.snapshot;
4515 : :
4516 : 2509 : sz += sizeof(SnapshotData) +
4517 : 2509 : sizeof(TransactionId) * snap->xcnt +
4518 : 2509 : sizeof(TransactionId) * snap->subxcnt;
4519 : :
4520 : 2509 : break;
4521 : : }
4522 : 81 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4523 : : {
4524 : 81 : sz += sizeof(Oid) * change->data.truncate.nrelids;
4525 : :
4526 : 81 : break;
4527 : : }
4528 : 94597 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4529 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4530 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4531 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4532 : : /* ReorderBufferChange contains everything important */
4533 : 94597 : break;
4534 : : }
4535 : :
4536 : 2017723 : return sz;
4537 : : }
4538 : :
4539 : :
4540 : : /*
4541 : : * Restore a number of changes spilled to disk back into memory.
4542 : : */
4543 : : static Size
4307 rhaas@postgresql.org 4544 : 105 : ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn,
4545 : : TXNEntryFile *file, XLogSegNo *segno)
4546 : : {
4547 : 105 : Size restored = 0;
4548 : : XLogSegNo last_segno;
4549 : : dlist_mutable_iter cleanup_iter;
2195 akapila@postgresql.o 4550 : 105 : File *fd = &file->vfd;
4551 : :
41 alvherre@kurilemu.de 4552 [ - + ]:GNC 105 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4553 [ - + ]: 105 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4554 : :
4555 : : /* free current entries, so we have memory for more */
4307 rhaas@postgresql.org 4556 [ + - + + ]:CBC 174836 : dlist_foreach_modify(cleanup_iter, &txn->changes)
4557 : : {
4558 : 174731 : ReorderBufferChange *cleanup =
943 tgl@sss.pgh.pa.us 4559 : 174731 : dlist_container(ReorderBufferChange, node, cleanup_iter.cur);
4560 : :
4307 rhaas@postgresql.org 4561 : 174731 : dlist_delete(&cleanup->node);
280 heikki.linnakangas@i 4562 : 174731 : ReorderBufferFreeChange(rb, cleanup, true);
4563 : : }
4307 rhaas@postgresql.org 4564 : 105 : txn->nentries_mem = 0;
4565 [ - + ]: 105 : Assert(dlist_is_empty(&txn->changes));
4566 : :
3011 andres@anarazel.de 4567 : 105 : XLByteToSeg(txn->final_lsn, last_segno, wal_segment_size);
4568 : :
4307 rhaas@postgresql.org 4569 [ + + + + ]: 178542 : while (restored < max_changes_in_memory && *segno <= last_segno)
4570 : : {
4571 : : int readBytes;
4572 : : ReorderBufferDiskChange *ondisk;
4573 : :
1153 akapila@postgresql.o 4574 [ - + ]: 178437 : CHECK_FOR_INTERRUPTS();
4575 : :
4307 rhaas@postgresql.org 4576 [ + + ]: 178437 : if (*fd == -1)
4577 : : {
4578 : : char path[MAXPGPATH];
4579 : :
4580 : : /* first time in */
4581 [ + + ]: 43 : if (*segno == 0)
3011 andres@anarazel.de 4582 : 40 : XLByteToSeg(txn->first_lsn, *segno, wal_segment_size);
4583 : :
4307 rhaas@postgresql.org 4584 [ - + - - ]: 43 : Assert(*segno != 0 || dlist_is_empty(&txn->changes));
4585 : :
4586 : : /*
4587 : : * No need to care about TLIs here, only used during a single run,
4588 : : * so each LSN only maps to a specific WAL record.
4589 : : */
2843 alvherre@alvh.no-ip. 4590 : 43 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid,
4591 : : *segno);
4592 : :
2195 akapila@postgresql.o 4593 : 43 : *fd = PathNameOpenFile(path, O_RDONLY | PG_BINARY);
4594 : :
4595 : : /* No harm in resetting the offset even in case of failure */
4596 : 43 : file->curOffset = 0;
4597 : :
4307 rhaas@postgresql.org 4598 [ - + - - ]: 43 : if (*fd < 0 && errno == ENOENT)
4599 : : {
4307 rhaas@postgresql.org 4600 :LBC (1) : *fd = -1;
4601 : (1) : (*segno)++;
4602 : (1) : continue;
4603 : : }
4307 rhaas@postgresql.org 4604 [ - + ]:CBC 43 : else if (*fd < 0)
4307 rhaas@postgresql.org 4605 [ # # ]:UBC 0 : ereport(ERROR,
4606 : : (errcode_for_file_access(),
4607 : : errmsg("could not open file \"%s\": %m",
4608 : : path)));
4609 : : }
4610 : :
4611 : : /*
4612 : : * Read the statically sized part of a change which has information
4613 : : * about the total size. If we couldn't read a record, we're at the
4614 : : * end of this file.
4615 : : */
4240 rhaas@postgresql.org 4616 :CBC 178437 : ReorderBufferSerializeReserve(rb, sizeof(ReorderBufferDiskChange));
2195 akapila@postgresql.o 4617 : 178437 : readBytes = FileRead(file->vfd, rb->outbuf,
4618 : : sizeof(ReorderBufferDiskChange),
4619 : : file->curOffset, WAIT_EVENT_REORDER_BUFFER_READ);
4620 : :
4621 : : /* eof */
4307 rhaas@postgresql.org 4622 [ + + ]: 178437 : if (readBytes == 0)
4623 : : {
2195 akapila@postgresql.o 4624 : 43 : FileClose(*fd);
4307 rhaas@postgresql.org 4625 : 43 : *fd = -1;
4626 : 43 : (*segno)++;
4627 : 43 : continue;
4628 : : }
4629 [ - + ]: 178394 : else if (readBytes < 0)
4307 rhaas@postgresql.org 4630 [ # # ]:UBC 0 : ereport(ERROR,
4631 : : (errcode_for_file_access(),
4632 : : errmsg("could not read from reorderbuffer spill file: %m")));
4307 rhaas@postgresql.org 4633 [ - + ]:CBC 178394 : else if (readBytes != sizeof(ReorderBufferDiskChange))
4307 rhaas@postgresql.org 4634 [ # # ]:UBC 0 : ereport(ERROR,
4635 : : (errcode_for_file_access(),
4636 : : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4637 : : readBytes,
4638 : : (uint32) sizeof(ReorderBufferDiskChange))));
4639 : :
2195 akapila@postgresql.o 4640 :CBC 178394 : file->curOffset += readBytes;
4641 : :
4307 rhaas@postgresql.org 4642 : 178394 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4643 : :
4644 : 178394 : ReorderBufferSerializeReserve(rb,
3101 tgl@sss.pgh.pa.us 4645 : 178394 : sizeof(ReorderBufferDiskChange) + ondisk->size);
4307 rhaas@postgresql.org 4646 : 178394 : ondisk = (ReorderBufferDiskChange *) rb->outbuf;
4647 : :
2195 akapila@postgresql.o 4648 : 356788 : readBytes = FileRead(file->vfd,
4649 : 178394 : rb->outbuf + sizeof(ReorderBufferDiskChange),
4650 : 178394 : ondisk->size - sizeof(ReorderBufferDiskChange),
4651 : : file->curOffset,
4652 : : WAIT_EVENT_REORDER_BUFFER_READ);
4653 : :
4307 rhaas@postgresql.org 4654 [ - + ]: 178394 : if (readBytes < 0)
4307 rhaas@postgresql.org 4655 [ # # ]:UBC 0 : ereport(ERROR,
4656 : : (errcode_for_file_access(),
4657 : : errmsg("could not read from reorderbuffer spill file: %m")));
4307 rhaas@postgresql.org 4658 [ - + ]:CBC 178394 : else if (readBytes != ondisk->size - sizeof(ReorderBufferDiskChange))
4307 rhaas@postgresql.org 4659 [ # # ]:UBC 0 : ereport(ERROR,
4660 : : (errcode_for_file_access(),
4661 : : errmsg("could not read from reorderbuffer spill file: read %d instead of %u bytes",
4662 : : readBytes,
4663 : : (uint32) (ondisk->size - sizeof(ReorderBufferDiskChange)))));
4664 : :
2195 akapila@postgresql.o 4665 :CBC 178394 : file->curOffset += readBytes;
4666 : :
4667 : : /*
4668 : : * ok, read a full change from disk, now restore it into proper
4669 : : * in-memory format
4670 : : */
4307 rhaas@postgresql.org 4671 : 178394 : ReorderBufferRestoreChange(rb, txn, rb->outbuf);
4672 : 178394 : restored++;
4673 : : }
4674 : :
4675 : 105 : return restored;
4676 : : }
4677 : :
4678 : : /*
4679 : : * Convert change from its on-disk format to in-memory format and queue it onto
4680 : : * the TXN's ->changes list.
4681 : : *
4682 : : * Note: although "data" is declared char*, at entry it points to a
4683 : : * maxalign'd buffer, making it safe in most of this function to assume
4684 : : * that the pointed-to data is suitably aligned for direct access.
4685 : : */
4686 : : static void
4687 : 178394 : ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn,
4688 : : char *data)
4689 : : {
4690 : : ReorderBufferDiskChange *ondisk;
4691 : : ReorderBufferChange *change;
4692 : :
4693 : 178394 : ondisk = (ReorderBufferDiskChange *) data;
4694 : :
280 heikki.linnakangas@i 4695 : 178394 : change = ReorderBufferAllocChange(rb);
4696 : :
4697 : : /* copy static part */
4307 rhaas@postgresql.org 4698 : 178394 : memcpy(change, &ondisk->change, sizeof(ReorderBufferChange));
4699 : :
4700 : 178394 : data += sizeof(ReorderBufferDiskChange);
4701 : :
4702 : : /* restore individual stuff */
4303 tgl@sss.pgh.pa.us 4703 [ + + + + : 178394 : switch (change->action)
- + - ]
4704 : : {
4705 : : /* fall through these, they're all similar enough */
4706 : 176465 : case REORDER_BUFFER_CHANGE_INSERT:
4707 : : case REORDER_BUFFER_CHANGE_UPDATE:
4708 : : case REORDER_BUFFER_CHANGE_DELETE:
4709 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT:
3574 andres@anarazel.de 4710 [ + + ]: 176465 : if (change->data.tp.oldtuple)
4711 : : {
3534 tgl@sss.pgh.pa.us 4712 : 5006 : uint32 tuplelen = ((HeapTuple) data)->t_len;
4713 : :
3574 andres@anarazel.de 4714 : 5006 : change->data.tp.oldtuple =
280 heikki.linnakangas@i 4715 : 5006 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4716 : :
4717 : : /* restore ->tuple */
688 msawada@postgresql.o 4718 : 5006 : memcpy(change->data.tp.oldtuple, data,
4719 : : sizeof(HeapTupleData));
3574 andres@anarazel.de 4720 : 5006 : data += sizeof(HeapTupleData);
4721 : :
4722 : : /* reset t_data pointer into the new tuplebuf */
688 msawada@postgresql.o 4723 : 5006 : change->data.tp.oldtuple->t_data =
4724 : 5006 : (HeapTupleHeader) ((char *) change->data.tp.oldtuple + HEAPTUPLESIZE);
4725 : :
4726 : : /* restore tuple data itself */
4727 : 5006 : memcpy(change->data.tp.oldtuple->t_data, data, tuplelen);
3574 andres@anarazel.de 4728 : 5006 : data += tuplelen;
4729 : : }
4730 : :
4731 [ + + ]: 176465 : if (change->data.tp.newtuple)
4732 : : {
4733 : : /* here, data might not be suitably aligned! */
4734 : : uint32 tuplelen;
4735 : :
3534 tgl@sss.pgh.pa.us 4736 : 166244 : memcpy(&tuplelen, data + offsetof(HeapTupleData, t_len),
4737 : : sizeof(uint32));
4738 : :
3574 andres@anarazel.de 4739 : 166244 : change->data.tp.newtuple =
280 heikki.linnakangas@i 4740 : 166244 : ReorderBufferAllocTupleBuf(rb, tuplelen - SizeofHeapTupleHeader);
4741 : :
4742 : : /* restore ->tuple */
688 msawada@postgresql.o 4743 : 166244 : memcpy(change->data.tp.newtuple, data,
4744 : : sizeof(HeapTupleData));
3574 andres@anarazel.de 4745 : 166244 : data += sizeof(HeapTupleData);
4746 : :
4747 : : /* reset t_data pointer into the new tuplebuf */
688 msawada@postgresql.o 4748 : 166244 : change->data.tp.newtuple->t_data =
4749 : 166244 : (HeapTupleHeader) ((char *) change->data.tp.newtuple + HEAPTUPLESIZE);
4750 : :
4751 : : /* restore tuple data itself */
4752 : 166244 : memcpy(change->data.tp.newtuple->t_data, data, tuplelen);
3574 andres@anarazel.de 4753 : 166244 : data += tuplelen;
4754 : : }
4755 : :
4307 rhaas@postgresql.org 4756 : 176465 : break;
3542 simon@2ndQuadrant.co 4757 : 1 : case REORDER_BUFFER_CHANGE_MESSAGE:
4758 : : {
4759 : : Size prefix_size;
4760 : :
4761 : : /* read prefix */
4762 : 1 : memcpy(&prefix_size, data, sizeof(Size));
4763 : 1 : data += sizeof(Size);
4764 : 1 : change->data.msg.prefix = MemoryContextAlloc(rb->context,
4765 : : prefix_size);
4766 : 1 : memcpy(change->data.msg.prefix, data, prefix_size);
3478 rhaas@postgresql.org 4767 [ - + ]: 1 : Assert(change->data.msg.prefix[prefix_size - 1] == '\0');
3542 simon@2ndQuadrant.co 4768 : 1 : data += prefix_size;
4769 : :
4770 : : /* read the message */
4771 : 1 : memcpy(&change->data.msg.message_size, data, sizeof(Size));
4772 : 1 : data += sizeof(Size);
4773 : 1 : change->data.msg.message = MemoryContextAlloc(rb->context,
4774 : : change->data.msg.message_size);
4775 : 1 : memcpy(change->data.msg.message, data,
4776 : : change->data.msg.message_size);
4777 : 1 : data += change->data.msg.message_size;
4778 : :
1889 akapila@postgresql.o 4779 : 1 : break;
4780 : : }
4781 : 23 : case REORDER_BUFFER_CHANGE_INVALIDATION:
4782 : : {
4783 : 23 : Size inval_size = sizeof(SharedInvalidationMessage) *
943 tgl@sss.pgh.pa.us 4784 : 23 : change->data.inval.ninvalidations;
4785 : :
1889 akapila@postgresql.o 4786 : 23 : change->data.inval.invalidations =
4787 : 23 : MemoryContextAlloc(rb->context, inval_size);
4788 : :
4789 : : /* read the message */
4790 : 23 : memcpy(change->data.inval.invalidations, data, inval_size);
4791 : :
3542 simon@2ndQuadrant.co 4792 : 23 : break;
4793 : : }
4307 rhaas@postgresql.org 4794 : 2 : case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT:
4795 : : {
4796 : : Snapshot oldsnap;
4797 : : Snapshot newsnap;
4798 : : Size size;
4799 : :
4303 tgl@sss.pgh.pa.us 4800 : 2 : oldsnap = (Snapshot) data;
4801 : :
4802 : 2 : size = sizeof(SnapshotData) +
4803 : 2 : sizeof(TransactionId) * oldsnap->xcnt +
4804 : 2 : sizeof(TransactionId) * (oldsnap->subxcnt + 0);
4805 : :
4806 : 2 : change->data.snapshot = MemoryContextAllocZero(rb->context, size);
4807 : :
4808 : 2 : newsnap = change->data.snapshot;
4809 : :
4810 : 2 : memcpy(newsnap, data, size);
4811 : 2 : newsnap->xip = (TransactionId *)
4812 : : (((char *) newsnap) + sizeof(SnapshotData));
4813 : 2 : newsnap->subxip = newsnap->xip + newsnap->xcnt;
4814 : 2 : newsnap->copied = true;
4307 rhaas@postgresql.org 4815 : 2 : break;
4816 : : }
4817 : : /* the base struct contains all the data, easy peasy */
2811 peter_e@gmx.net 4818 :UBC 0 : case REORDER_BUFFER_CHANGE_TRUNCATE:
4819 : : {
4820 : : Oid *relids;
4821 : :
280 heikki.linnakangas@i 4822 : 0 : relids = ReorderBufferAllocRelids(rb, change->data.truncate.nrelids);
2662 tomas.vondra@postgre 4823 : 0 : memcpy(relids, data, change->data.truncate.nrelids * sizeof(Oid));
4824 : 0 : change->data.truncate.relids = relids;
4825 : :
4826 : 0 : break;
4827 : : }
3876 andres@anarazel.de 4828 :CBC 1903 : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM:
4829 : : case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_ABORT:
4830 : : case REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID:
4831 : : case REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID:
4307 rhaas@postgresql.org 4832 : 1903 : break;
4833 : : }
4834 : :
4835 : 178394 : dlist_push_tail(&txn->changes, &change->node);
4836 : 178394 : txn->nentries_mem++;
4837 : :
4838 : : /*
4839 : : * Update memory accounting for the restored change. We need to do this
4840 : : * although we don't check the memory limit when restoring the changes in
4841 : : * this branch (we only do that when initially queueing the changes after
4842 : : * decoding), because we will release the changes later, and that will
4843 : : * update the accounting too (subtracting the size from the counters). And
4844 : : * we don't want to underflow there.
4845 : : */
623 msawada@postgresql.o 4846 : 178394 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
4847 : : ReorderBufferChangeSize(change));
4307 rhaas@postgresql.org 4848 : 178394 : }
4849 : :
4850 : : /*
4851 : : * Remove all on-disk stored for the passed in transaction.
4852 : : */
4853 : : static void
4854 : 246 : ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn)
4855 : : {
4856 : : XLogSegNo first;
4857 : : XLogSegNo cur;
4858 : : XLogSegNo last;
4859 : :
41 alvherre@kurilemu.de 4860 [ - + ]:GNC 246 : Assert(XLogRecPtrIsValid(txn->first_lsn));
4861 [ - + ]: 246 : Assert(XLogRecPtrIsValid(txn->final_lsn));
4862 : :
3011 andres@anarazel.de 4863 :CBC 246 : XLByteToSeg(txn->first_lsn, first, wal_segment_size);
4864 : 246 : XLByteToSeg(txn->final_lsn, last, wal_segment_size);
4865 : :
4866 : : /* iterate over all possible filenames, and delete them */
4307 rhaas@postgresql.org 4867 [ + + ]: 512 : for (cur = first; cur <= last; cur++)
4868 : : {
4869 : : char path[MAXPGPATH];
4870 : :
2843 alvherre@alvh.no-ip. 4871 : 266 : ReorderBufferSerializedPath(path, MyReplicationSlot, txn->xid, cur);
4307 rhaas@postgresql.org 4872 [ - + - - ]: 266 : if (unlink(path) != 0 && errno != ENOENT)
4307 rhaas@postgresql.org 4873 [ # # ]:UBC 0 : ereport(ERROR,
4874 : : (errcode_for_file_access(),
4875 : : errmsg("could not remove file \"%s\": %m", path)));
4876 : : }
4307 rhaas@postgresql.org 4877 :CBC 246 : }
4878 : :
4879 : : /*
4880 : : * Remove any leftover serialized reorder buffers from a slot directory after a
4881 : : * prior crash or decoding session exit.
4882 : : */
4883 : : static void
2843 alvherre@alvh.no-ip. 4884 : 2003 : ReorderBufferCleanupSerializedTXNs(const char *slotname)
4885 : : {
4886 : : DIR *spill_dir;
4887 : : struct dirent *spill_de;
4888 : : struct stat statbuf;
4889 : : char path[MAXPGPATH * 2 + sizeof(PG_REPLSLOT_DIR)];
4890 : :
474 michael@paquier.xyz 4891 : 2003 : sprintf(path, "%s/%s", PG_REPLSLOT_DIR, slotname);
4892 : :
4893 : : /* we're only handling directories here, skip if it's not ours */
2843 alvherre@alvh.no-ip. 4894 [ + - - + ]: 2003 : if (lstat(path, &statbuf) == 0 && !S_ISDIR(statbuf.st_mode))
2843 alvherre@alvh.no-ip. 4895 :UBC 0 : return;
4896 : :
2843 alvherre@alvh.no-ip. 4897 :CBC 2003 : spill_dir = AllocateDir(path);
4898 [ + + ]: 10015 : while ((spill_de = ReadDirExtended(spill_dir, path, INFO)) != NULL)
4899 : : {
4900 : : /* only look at names that can be ours */
4901 [ - + ]: 6009 : if (strncmp(spill_de->d_name, "xid", 3) == 0)
4902 : : {
2843 alvherre@alvh.no-ip. 4903 :UBC 0 : snprintf(path, sizeof(path),
4904 : : "%s/%s/%s", PG_REPLSLOT_DIR, slotname,
4905 : 0 : spill_de->d_name);
4906 : :
4907 [ # # ]: 0 : if (unlink(path) != 0)
4908 [ # # ]: 0 : ereport(ERROR,
4909 : : (errcode_for_file_access(),
4910 : : errmsg("could not remove file \"%s\" during removal of %s/%s/xid*: %m",
4911 : : path, PG_REPLSLOT_DIR, slotname)));
4912 : : }
4913 : : }
2843 alvherre@alvh.no-ip. 4914 :CBC 2003 : FreeDir(spill_dir);
4915 : : }
4916 : :
4917 : : /*
4918 : : * Given a replication slot, transaction ID and segment number, fill in the
4919 : : * corresponding spill file into 'path', which is a caller-owned buffer of size
4920 : : * at least MAXPGPATH.
4921 : : */
4922 : : static void
4923 : 3833 : ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid,
4924 : : XLogSegNo segno)
4925 : : {
4926 : : XLogRecPtr recptr;
4927 : :
2718 4928 : 3833 : XLogSegNoOffsetToRecPtr(segno, 0, wal_segment_size, recptr);
4929 : :
474 michael@paquier.xyz 4930 : 3833 : snprintf(path, MAXPGPATH, "%s/%s/xid-%u-lsn-%X-%X.spill",
4931 : : PG_REPLSLOT_DIR,
2792 tgl@sss.pgh.pa.us 4932 : 3833 : NameStr(MyReplicationSlot->data.name),
1758 peter@eisentraut.org 4933 : 3833 : xid, LSN_FORMAT_ARGS(recptr));
2843 alvherre@alvh.no-ip. 4934 : 3833 : }
4935 : :
4936 : : /*
4937 : : * Delete all data spilled to disk after we've restarted/crashed. It will be
4938 : : * recreated when the respective slots are reused.
4939 : : */
4940 : : void
4307 rhaas@postgresql.org 4941 : 925 : StartupReorderBuffer(void)
4942 : : {
4943 : : DIR *logical_dir;
4944 : : struct dirent *logical_de;
4945 : :
474 michael@paquier.xyz 4946 : 925 : logical_dir = AllocateDir(PG_REPLSLOT_DIR);
4947 [ + + ]: 2873 : while ((logical_de = ReadDir(logical_dir, PG_REPLSLOT_DIR)) != NULL)
4948 : : {
4307 rhaas@postgresql.org 4949 [ + + ]: 1948 : if (strcmp(logical_de->d_name, ".") == 0 ||
4950 [ + + ]: 1023 : strcmp(logical_de->d_name, "..") == 0)
4951 : 1850 : continue;
4952 : :
4953 : : /* if it cannot be a slot, skip the directory */
147 akapila@postgresql.o 4954 [ - + ]:GNC 98 : if (!ReplicationSlotValidateName(logical_de->d_name, true, DEBUG2))
4307 rhaas@postgresql.org 4955 :UBC 0 : continue;
4956 : :
4957 : : /*
4958 : : * ok, has to be a surviving logical slot, iterate and delete
4959 : : * everything starting with xid-*
4960 : : */
2843 alvherre@alvh.no-ip. 4961 :CBC 98 : ReorderBufferCleanupSerializedTXNs(logical_de->d_name);
4962 : : }
4307 rhaas@postgresql.org 4963 : 925 : FreeDir(logical_dir);
4964 : 925 : }
4965 : :
4966 : : /* ---------------------------------------
4967 : : * toast reassembly support
4968 : : * ---------------------------------------
4969 : : */
4970 : :
4971 : : /*
4972 : : * Initialize per tuple toast reconstruction support.
4973 : : */
4974 : : static void
4975 : 35 : ReorderBufferToastInitHash(ReorderBuffer *rb, ReorderBufferTXN *txn)
4976 : : {
4977 : : HASHCTL hash_ctl;
4978 : :
4979 [ - + ]: 35 : Assert(txn->toast_hash == NULL);
4980 : :
4981 : 35 : hash_ctl.keysize = sizeof(Oid);
4982 : 35 : hash_ctl.entrysize = sizeof(ReorderBufferToastEnt);
4983 : 35 : hash_ctl.hcxt = rb->context;
4984 : 35 : txn->toast_hash = hash_create("ReorderBufferToastHash", 5, &hash_ctl,
4985 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
4986 : 35 : }
4987 : :
4988 : : /*
4989 : : * Per toast-chunk handling for toast reconstruction
4990 : : *
4991 : : * Appends a toast chunk so we can reconstruct it when the tuple "owning" the
4992 : : * toasted Datum comes along.
4993 : : */
4994 : : static void
4995 : 1830 : ReorderBufferToastAppendChunk(ReorderBuffer *rb, ReorderBufferTXN *txn,
4996 : : Relation relation, ReorderBufferChange *change)
4997 : : {
4998 : : ReorderBufferToastEnt *ent;
4999 : : HeapTuple newtup;
5000 : : bool found;
5001 : : int32 chunksize;
5002 : : bool isnull;
5003 : : Pointer chunk;
5004 : 1830 : TupleDesc desc = RelationGetDescr(relation);
5005 : : Oid chunk_id;
5006 : : int32 chunk_seq;
5007 : :
5008 [ + + ]: 1830 : if (txn->toast_hash == NULL)
5009 : 35 : ReorderBufferToastInitHash(rb, txn);
5010 : :
5011 [ - + ]: 1830 : Assert(IsToastRelation(relation));
5012 : :
4303 tgl@sss.pgh.pa.us 5013 : 1830 : newtup = change->data.tp.newtuple;
688 msawada@postgresql.o 5014 : 1830 : chunk_id = DatumGetObjectId(fastgetattr(newtup, 1, desc, &isnull));
4307 rhaas@postgresql.org 5015 [ - + ]: 1830 : Assert(!isnull);
688 msawada@postgresql.o 5016 : 1830 : chunk_seq = DatumGetInt32(fastgetattr(newtup, 2, desc, &isnull));
4307 rhaas@postgresql.org 5017 [ - + ]: 1830 : Assert(!isnull);
5018 : :
5019 : : ent = (ReorderBufferToastEnt *)
1045 peter@eisentraut.org 5020 : 1830 : hash_search(txn->toast_hash, &chunk_id, HASH_ENTER, &found);
5021 : :
4307 rhaas@postgresql.org 5022 [ + + ]: 1830 : if (!found)
5023 : : {
5024 [ - + ]: 49 : Assert(ent->chunk_id == chunk_id);
5025 : 49 : ent->num_chunks = 0;
5026 : 49 : ent->last_chunk_seq = 0;
5027 : 49 : ent->size = 0;
5028 : 49 : ent->reconstructed = NULL;
5029 : 49 : dlist_init(&ent->chunks);
5030 : :
5031 [ - + ]: 49 : if (chunk_seq != 0)
4307 rhaas@postgresql.org 5032 [ # # ]:UBC 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq 0",
5033 : : chunk_seq, chunk_id);
5034 : : }
4307 rhaas@postgresql.org 5035 [ + - - + ]:CBC 1781 : else if (found && chunk_seq != ent->last_chunk_seq + 1)
4307 rhaas@postgresql.org 5036 [ # # ]:UBC 0 : elog(ERROR, "got sequence entry %d for toast chunk %u instead of seq %d",
5037 : : chunk_seq, chunk_id, ent->last_chunk_seq + 1);
5038 : :
688 msawada@postgresql.o 5039 :CBC 1830 : chunk = DatumGetPointer(fastgetattr(newtup, 3, desc, &isnull));
4307 rhaas@postgresql.org 5040 [ - + ]: 1830 : Assert(!isnull);
5041 : :
5042 : : /* calculate size so we can allocate the right size at once later */
5043 [ + - ]: 1830 : if (!VARATT_IS_EXTENDED(chunk))
5044 : 1830 : chunksize = VARSIZE(chunk) - VARHDRSZ;
4307 rhaas@postgresql.org 5045 [ # # ]:UBC 0 : else if (VARATT_IS_SHORT(chunk))
5046 : : /* could happen due to heap_form_tuple doing its thing */
5047 : 0 : chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT;
5048 : : else
5049 [ # # ]: 0 : elog(ERROR, "unexpected type of toast chunk");
5050 : :
4307 rhaas@postgresql.org 5051 :CBC 1830 : ent->size += chunksize;
5052 : 1830 : ent->last_chunk_seq = chunk_seq;
5053 : 1830 : ent->num_chunks++;
5054 : 1830 : dlist_push_tail(&ent->chunks, &change->node);
5055 : 1830 : }
5056 : :
5057 : : /*
5058 : : * Rejigger change->newtuple to point to in-memory toast tuples instead of
5059 : : * on-disk toast tuples that may no longer exist (think DROP TABLE or VACUUM).
5060 : : *
5061 : : * We cannot replace unchanged toast tuples though, so those will still point
5062 : : * to on-disk toast data.
5063 : : *
5064 : : * While updating the existing change with detoasted tuple data, we need to
5065 : : * update the memory accounting info, because the change size will differ.
5066 : : * Otherwise the accounting may get out of sync, triggering serialization
5067 : : * at unexpected times.
5068 : : *
5069 : : * We simply subtract size of the change before rejiggering the tuple, and
5070 : : * then add the new size. This makes it look like the change was removed
5071 : : * and then added back, except it only tweaks the accounting info.
5072 : : *
5073 : : * In particular it can't trigger serialization, which would be pointless
5074 : : * anyway as it happens during commit processing right before handing
5075 : : * the change to the output plugin.
5076 : : */
5077 : : static void
5078 : 334053 : ReorderBufferToastReplace(ReorderBuffer *rb, ReorderBufferTXN *txn,
5079 : : Relation relation, ReorderBufferChange *change)
5080 : : {
5081 : : TupleDesc desc;
5082 : : int natt;
5083 : : Datum *attrs;
5084 : : bool *isnull;
5085 : : bool *free;
5086 : : HeapTuple tmphtup;
5087 : : Relation toast_rel;
5088 : : TupleDesc toast_desc;
5089 : : MemoryContext oldcontext;
5090 : : HeapTuple newtup;
5091 : : Size old_size;
5092 : :
5093 : : /* no toast tuples changed */
5094 [ + + ]: 334053 : if (txn->toast_hash == NULL)
5095 : 333807 : return;
5096 : :
5097 : : /*
5098 : : * We're going to modify the size of the change. So, to make sure the
5099 : : * accounting is correct we record the current change size and then after
5100 : : * re-computing the change we'll subtract the recorded size and then
5101 : : * re-add the new change size at the end. We don't immediately subtract
5102 : : * the old size because if there is any error before we add the new size,
5103 : : * we will release the changes and that will update the accounting info
5104 : : * (subtracting the size from the counters). And we don't want to
5105 : : * underflow there.
5106 : : */
1556 akapila@postgresql.o 5107 : 246 : old_size = ReorderBufferChangeSize(change);
5108 : :
4307 rhaas@postgresql.org 5109 : 246 : oldcontext = MemoryContextSwitchTo(rb->context);
5110 : :
5111 : : /* we should only have toast tuples in an INSERT or UPDATE */
4303 tgl@sss.pgh.pa.us 5112 [ - + ]: 246 : Assert(change->data.tp.newtuple);
5113 : :
4307 rhaas@postgresql.org 5114 : 246 : desc = RelationGetDescr(relation);
5115 : :
5116 : 246 : toast_rel = RelationIdGetRelation(relation->rd_rel->reltoastrelid);
2292 tgl@sss.pgh.pa.us 5117 [ - + ]: 246 : if (!RelationIsValid(toast_rel))
1547 akapila@postgresql.o 5118 [ # # ]:UBC 0 : elog(ERROR, "could not open toast relation with OID %u (base relation \"%s\")",
5119 : : relation->rd_rel->reltoastrelid, RelationGetRelationName(relation));
5120 : :
4307 rhaas@postgresql.org 5121 :CBC 246 : toast_desc = RelationGetDescr(toast_rel);
5122 : :
5123 : : /* should we allocate from stack instead? */
7 michael@paquier.xyz 5124 :GNC 246 : attrs = palloc0_array(Datum, desc->natts);
5125 : 246 : isnull = palloc0_array(bool, desc->natts);
5126 : 246 : free = palloc0_array(bool, desc->natts);
5127 : :
4303 tgl@sss.pgh.pa.us 5128 :CBC 246 : newtup = change->data.tp.newtuple;
5129 : :
688 msawada@postgresql.o 5130 : 246 : heap_deform_tuple(newtup, desc, attrs, isnull);
5131 : :
4307 rhaas@postgresql.org 5132 [ + + ]: 757 : for (natt = 0; natt < desc->natts; natt++)
5133 : : {
56 drowley@postgresql.o 5134 :GNC 511 : CompactAttribute *attr = TupleDescCompactAttr(desc, natt);
5135 : : ReorderBufferToastEnt *ent;
5136 : : struct varlena *varlena;
5137 : :
5138 : : /* va_rawsize is the size of the original datum -- including header */
5139 : : struct varatt_external toast_pointer;
5140 : : struct varatt_indirect redirect_pointer;
4307 rhaas@postgresql.org 5141 :CBC 511 : struct varlena *new_datum = NULL;
5142 : : struct varlena *reconstructed;
5143 : : dlist_iter it;
5144 : 511 : Size data_done = 0;
5145 : :
5146 [ - + ]: 511 : if (attr->attisdropped)
4307 rhaas@postgresql.org 5147 :GBC 463 : continue;
5148 : :
5149 : : /* not a varlena datatype */
4307 rhaas@postgresql.org 5150 [ + + ]:CBC 511 : if (attr->attlen != -1)
5151 : 241 : continue;
5152 : :
5153 : : /* no data */
5154 [ + + ]: 270 : if (isnull[natt])
5155 : 12 : continue;
5156 : :
5157 : : /* ok, we know we have a toast datum */
5158 : 258 : varlena = (struct varlena *) DatumGetPointer(attrs[natt]);
5159 : :
5160 : : /* no need to do anything if the tuple isn't external */
5161 [ + + ]: 258 : if (!VARATT_IS_EXTERNAL(varlena))
5162 : 202 : continue;
5163 : :
5164 [ - + - + : 56 : VARATT_EXTERNAL_GET_POINTER(toast_pointer, varlena);
+ - - + -
+ ]
5165 : :
5166 : : /*
5167 : : * Check whether the toast tuple changed, replace if so.
5168 : : */
5169 : : ent = (ReorderBufferToastEnt *)
5170 : 56 : hash_search(txn->toast_hash,
5171 : : &toast_pointer.va_valueid,
5172 : : HASH_FIND,
5173 : : NULL);
5174 [ + + ]: 56 : if (ent == NULL)
5175 : 8 : continue;
5176 : :
5177 : : new_datum =
5178 : 48 : (struct varlena *) palloc0(INDIRECT_POINTER_SIZE);
5179 : :
5180 : 48 : free[natt] = true;
5181 : :
5182 : 48 : reconstructed = palloc0(toast_pointer.va_rawsize);
5183 : :
5184 : 48 : ent->reconstructed = reconstructed;
5185 : :
5186 : : /* stitch toast tuple back together from its parts */
5187 [ + - + + ]: 1827 : dlist_foreach(it, &ent->chunks)
5188 : : {
5189 : : bool cisnull;
5190 : : ReorderBufferChange *cchange;
5191 : : HeapTuple ctup;
5192 : : Pointer chunk;
5193 : :
4303 tgl@sss.pgh.pa.us 5194 : 1779 : cchange = dlist_container(ReorderBufferChange, node, it.cur);
5195 : 1779 : ctup = cchange->data.tp.newtuple;
688 msawada@postgresql.o 5196 : 1779 : chunk = DatumGetPointer(fastgetattr(ctup, 3, toast_desc, &cisnull));
5197 : :
839 michael@paquier.xyz 5198 [ - + ]: 1779 : Assert(!cisnull);
4307 rhaas@postgresql.org 5199 [ - + ]: 1779 : Assert(!VARATT_IS_EXTERNAL(chunk));
5200 [ - + ]: 1779 : Assert(!VARATT_IS_SHORT(chunk));
5201 : :
5202 : 1779 : memcpy(VARDATA(reconstructed) + data_done,
5203 : 1779 : VARDATA(chunk),
5204 : 1779 : VARSIZE(chunk) - VARHDRSZ);
5205 : 1779 : data_done += VARSIZE(chunk) - VARHDRSZ;
5206 : : }
1734 5207 [ - + ]: 48 : Assert(data_done == VARATT_EXTERNAL_GET_EXTSIZE(toast_pointer));
5208 : :
5209 : : /* make sure its marked as compressed or not */
4307 5210 [ + + ]: 48 : if (VARATT_EXTERNAL_IS_COMPRESSED(toast_pointer))
5211 : 5 : SET_VARSIZE_COMPRESSED(reconstructed, data_done + VARHDRSZ);
5212 : : else
5213 : 43 : SET_VARSIZE(reconstructed, data_done + VARHDRSZ);
5214 : :
5215 : 48 : memset(&redirect_pointer, 0, sizeof(redirect_pointer));
5216 : 48 : redirect_pointer.pointer = reconstructed;
5217 : :
5218 : 48 : SET_VARTAG_EXTERNAL(new_datum, VARTAG_INDIRECT);
5219 : 48 : memcpy(VARDATA_EXTERNAL(new_datum), &redirect_pointer,
5220 : : sizeof(redirect_pointer));
5221 : :
5222 : 48 : attrs[natt] = PointerGetDatum(new_datum);
5223 : : }
5224 : :
5225 : : /*
5226 : : * Build tuple in separate memory & copy tuple back into the tuplebuf
5227 : : * passed to the output plugin. We can't directly heap_fill_tuple() into
5228 : : * the tuplebuf because attrs[] will point back into the current content.
5229 : : */
4303 tgl@sss.pgh.pa.us 5230 : 246 : tmphtup = heap_form_tuple(desc, attrs, isnull);
688 msawada@postgresql.o 5231 [ - + ]: 246 : Assert(newtup->t_len <= MaxHeapTupleSize);
5232 [ - + ]: 246 : Assert(newtup->t_data == (HeapTupleHeader) ((char *) newtup + HEAPTUPLESIZE));
5233 : :
5234 : 246 : memcpy(newtup->t_data, tmphtup->t_data, tmphtup->t_len);
5235 : 246 : newtup->t_len = tmphtup->t_len;
5236 : :
5237 : : /*
5238 : : * free resources we won't further need, more persistent stuff will be
5239 : : * free'd in ReorderBufferToastReset().
5240 : : */
4307 rhaas@postgresql.org 5241 : 246 : RelationClose(toast_rel);
4303 tgl@sss.pgh.pa.us 5242 : 246 : pfree(tmphtup);
4307 rhaas@postgresql.org 5243 [ + + ]: 757 : for (natt = 0; natt < desc->natts; natt++)
5244 : : {
5245 [ + + ]: 511 : if (free[natt])
5246 : 48 : pfree(DatumGetPointer(attrs[natt]));
5247 : : }
5248 : 246 : pfree(attrs);
5249 : 246 : pfree(free);
5250 : 246 : pfree(isnull);
5251 : :
5252 : 246 : MemoryContextSwitchTo(oldcontext);
5253 : :
5254 : : /* subtract the old change size */
623 msawada@postgresql.o 5255 : 246 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, false, old_size);
5256 : : /* now add the change back, with the correct size */
5257 : 246 : ReorderBufferChangeMemoryUpdate(rb, change, NULL, true,
5258 : : ReorderBufferChangeSize(change));
5259 : : }
5260 : :
5261 : : /*
5262 : : * Free all resources allocated for toast reconstruction.
5263 : : */
5264 : : static void
4307 rhaas@postgresql.org 5265 : 337709 : ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn)
5266 : : {
5267 : : HASH_SEQ_STATUS hstat;
5268 : : ReorderBufferToastEnt *ent;
5269 : :
5270 [ + + ]: 337709 : if (txn->toast_hash == NULL)
5271 : 337674 : return;
5272 : :
5273 : : /* sequentially walk over the hash and free everything */
5274 : 35 : hash_seq_init(&hstat, txn->toast_hash);
5275 [ + + ]: 84 : while ((ent = (ReorderBufferToastEnt *) hash_seq_search(&hstat)) != NULL)
5276 : : {
5277 : : dlist_mutable_iter it;
5278 : :
5279 [ + + ]: 49 : if (ent->reconstructed != NULL)
5280 : 48 : pfree(ent->reconstructed);
5281 : :
5282 [ + - + + ]: 1879 : dlist_foreach_modify(it, &ent->chunks)
5283 : : {
5284 : 1830 : ReorderBufferChange *change =
943 tgl@sss.pgh.pa.us 5285 : 1830 : dlist_container(ReorderBufferChange, node, it.cur);
5286 : :
4307 rhaas@postgresql.org 5287 : 1830 : dlist_delete(&change->node);
280 heikki.linnakangas@i 5288 : 1830 : ReorderBufferFreeChange(rb, change, true);
5289 : : }
5290 : : }
5291 : :
4307 rhaas@postgresql.org 5292 : 35 : hash_destroy(txn->toast_hash);
5293 : 35 : txn->toast_hash = NULL;
5294 : : }
5295 : :
5296 : :
5297 : : /* ---------------------------------------
5298 : : * Visibility support for logical decoding
5299 : : *
5300 : : *
5301 : : * Lookup actual cmin/cmax values when using decoding snapshot. We can't
5302 : : * always rely on stored cmin/cmax values because of two scenarios:
5303 : : *
5304 : : * * A tuple got changed multiple times during a single transaction and thus
5305 : : * has got a combo CID. Combo CIDs are only valid for the duration of a
5306 : : * single transaction.
5307 : : * * A tuple with a cmin but no cmax (and thus no combo CID) got
5308 : : * deleted/updated in another transaction than the one which created it
5309 : : * which we are looking at right now. As only one of cmin, cmax or combo CID
5310 : : * is actually stored in the heap we don't have access to the value we
5311 : : * need anymore.
5312 : : *
5313 : : * To resolve those problems we have a per-transaction hash of (cmin,
5314 : : * cmax) tuples keyed by (relfilelocator, ctid) which contains the actual
5315 : : * (cmin, cmax) values. That also takes care of combo CIDs by simply
5316 : : * not caring about them at all. As we have the real cmin/cmax values
5317 : : * combo CIDs aren't interesting.
5318 : : *
5319 : : * As we only care about catalog tuples here the overhead of this
5320 : : * hashtable should be acceptable.
5321 : : *
5322 : : * Heap rewrites complicate this a bit, check rewriteheap.c for
5323 : : * details.
5324 : : * -------------------------------------------------------------------------
5325 : : */
5326 : :
5327 : : /* struct for sorting mapping files by LSN efficiently */
5328 : : typedef struct RewriteMappingFile
5329 : : {
5330 : : XLogRecPtr lsn;
5331 : : char fname[MAXPGPATH];
5332 : : } RewriteMappingFile;
5333 : :
#ifdef NOT_USED
/*
 * Debugging aid: walk every entry of tuplecid_data and emit one DEBUG3 log
 * line per entry showing its key (relfilelocator, tid) and its cmin/cmax.
 */
static void
DisplayMapping(HTAB *tuplecid_data)
{
	HASH_SEQ_STATUS scan;
	ReorderBufferTupleCidEnt *entry;

	hash_seq_init(&scan, tuplecid_data);

	for (entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan);
		 entry != NULL;
		 entry = (ReorderBufferTupleCidEnt *) hash_seq_search(&scan))
	{
		ReorderBufferTupleCidKey *k = &entry->key;

		elog(DEBUG3, "mapping: node: %u/%u/%u tid: %u/%u cmin: %u, cmax: %u",
			 k->rlocator.dbOid,
			 k->rlocator.spcOid,
			 k->rlocator.relNumber,
			 ItemPointerGetBlockNumber(&k->tid),
			 ItemPointerGetOffsetNumber(&k->tid),
			 entry->cmin,
			 entry->cmax);
	}
}
#endif
5356 : :
5357 : : /*
5358 : : * Apply a single mapping file to tuplecid_data.
5359 : : *
5360 : : * The mapping file has to have been verified to be a) committed b) for our
5361 : : * transaction c) applied in LSN order.
5362 : : */
5363 : : static void
5364 : 27 : ApplyLogicalMappingFile(HTAB *tuplecid_data, Oid relid, const char *fname)
5365 : : {
5366 : : char path[MAXPGPATH];
5367 : : int fd;
5368 : : int readBytes;
5369 : : LogicalRewriteMappingData map;
5370 : :
474 michael@paquier.xyz 5371 : 27 : sprintf(path, "%s/%s", PG_LOGICAL_MAPPINGS_DIR, fname);
3007 peter_e@gmx.net 5372 : 27 : fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
4307 rhaas@postgresql.org 5373 [ + - ]: 27 : if (fd < 0)
4307 rhaas@postgresql.org 5374 [ # # ]:UBC 0 : ereport(ERROR,
5375 : : (errcode_for_file_access(),
5376 : : errmsg("could not open file \"%s\": %m", path)));
5377 : :
5378 : : while (true)
4307 rhaas@postgresql.org 5379 :CBC 209 : {
5380 : : ReorderBufferTupleCidKey key;
5381 : : ReorderBufferTupleCidEnt *ent;
5382 : : ReorderBufferTupleCidEnt *new_ent;
5383 : : bool found;
5384 : :
5385 : : /* be careful about padding */
5386 : 236 : memset(&key, 0, sizeof(ReorderBufferTupleCidKey));
5387 : :
5388 : : /* read all mappings till the end of the file */
3196 5389 : 236 : pgstat_report_wait_start(WAIT_EVENT_REORDER_LOGICAL_MAPPING_READ);
4307 5390 : 236 : readBytes = read(fd, &map, sizeof(LogicalRewriteMappingData));
3196 5391 : 236 : pgstat_report_wait_end();
5392 : :
4307 5393 [ - + ]: 236 : if (readBytes < 0)
4307 rhaas@postgresql.org 5394 [ # # ]:UBC 0 : ereport(ERROR,
5395 : : (errcode_for_file_access(),
5396 : : errmsg("could not read file \"%s\": %m",
5397 : : path)));
4243 bruce@momjian.us 5398 [ + + ]:CBC 236 : else if (readBytes == 0) /* EOF */
4307 rhaas@postgresql.org 5399 : 27 : break;
5400 [ - + ]: 209 : else if (readBytes != sizeof(LogicalRewriteMappingData))
4307 rhaas@postgresql.org 5401 [ # # ]:UBC 0 : ereport(ERROR,
5402 : : (errcode_for_file_access(),
5403 : : errmsg("could not read from file \"%s\": read %d instead of %d bytes",
5404 : : path, readBytes,
5405 : : (int32) sizeof(LogicalRewriteMappingData))));
5406 : :
1260 rhaas@postgresql.org 5407 :CBC 209 : key.rlocator = map.old_locator;
4307 5408 : 209 : ItemPointerCopy(&map.old_tid,
5409 : : &key.tid);
5410 : :
5411 : :
5412 : : ent = (ReorderBufferTupleCidEnt *)
1045 peter@eisentraut.org 5413 : 209 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5414 : :
5415 : : /* no existing mapping, no need to update */
4307 rhaas@postgresql.org 5416 [ - + ]: 209 : if (!ent)
4307 rhaas@postgresql.org 5417 :UBC 0 : continue;
5418 : :
1260 rhaas@postgresql.org 5419 :CBC 209 : key.rlocator = map.new_locator;
4307 5420 : 209 : ItemPointerCopy(&map.new_tid,
5421 : : &key.tid);
5422 : :
5423 : : new_ent = (ReorderBufferTupleCidEnt *)
1045 peter@eisentraut.org 5424 : 209 : hash_search(tuplecid_data, &key, HASH_ENTER, &found);
5425 : :
4307 rhaas@postgresql.org 5426 [ + + ]: 209 : if (found)
5427 : : {
5428 : : /*
5429 : : * Make sure the existing mapping makes sense. We sometime update
5430 : : * old records that did not yet have a cmax (e.g. pg_class' own
5431 : : * entry while rewriting it) during rewrites, so allow that.
5432 : : */
5433 [ + - - + ]: 6 : Assert(ent->cmin == InvalidCommandId || ent->cmin == new_ent->cmin);
5434 [ - + - - ]: 6 : Assert(ent->cmax == InvalidCommandId || ent->cmax == new_ent->cmax);
5435 : : }
5436 : : else
5437 : : {
5438 : : /* update mapping */
5439 : 203 : new_ent->cmin = ent->cmin;
5440 : 203 : new_ent->cmax = ent->cmax;
5441 : 203 : new_ent->combocid = ent->combocid;
5442 : : }
5443 : : }
5444 : :
2356 peter@eisentraut.org 5445 [ - + ]: 27 : if (CloseTransientFile(fd) != 0)
2475 michael@paquier.xyz 5446 [ # # ]:UBC 0 : ereport(ERROR,
5447 : : (errcode_for_file_access(),
5448 : : errmsg("could not close file \"%s\": %m", path)));
4307 rhaas@postgresql.org 5449 :CBC 27 : }
5450 : :
5451 : :
5452 : : /*
5453 : : * Check whether the TransactionId 'xid' is in the pre-sorted array 'xip'.
5454 : : */
5455 : : static bool
5456 : 348 : TransactionIdInArray(TransactionId xid, TransactionId *xip, Size num)
5457 : : {
5458 : 348 : return bsearch(&xid, xip, num,
5459 : 348 : sizeof(TransactionId), xidComparator) != NULL;
5460 : : }
5461 : :
5462 : : /*
5463 : : * list_sort() comparator for sorting RewriteMappingFiles in LSN order.
5464 : : */
5465 : : static int
2346 tgl@sss.pgh.pa.us 5466 : 41 : file_sort_by_lsn(const ListCell *a_p, const ListCell *b_p)
5467 : : {
5468 : 41 : RewriteMappingFile *a = (RewriteMappingFile *) lfirst(a_p);
5469 : 41 : RewriteMappingFile *b = (RewriteMappingFile *) lfirst(b_p);
5470 : :
670 nathan@postgresql.or 5471 : 41 : return pg_cmp_u64(a->lsn, b->lsn);
5472 : : }
5473 : :
5474 : : /*
5475 : : * Apply any existing logical remapping files if there are any targeted at our
5476 : : * transaction for relid.
5477 : : */
5478 : : static void
4307 rhaas@postgresql.org 5479 : 11 : UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot)
5480 : : {
5481 : : DIR *mapping_dir;
5482 : : struct dirent *mapping_de;
5483 : 11 : List *files = NIL;
5484 : : ListCell *file;
5485 [ + - ]: 11 : Oid dboid = IsSharedRelation(relid) ? InvalidOid : MyDatabaseId;
5486 : :
474 michael@paquier.xyz 5487 : 11 : mapping_dir = AllocateDir(PG_LOGICAL_MAPPINGS_DIR);
5488 [ + + ]: 573 : while ((mapping_de = ReadDir(mapping_dir, PG_LOGICAL_MAPPINGS_DIR)) != NULL)
5489 : : {
5490 : : Oid f_dboid;
5491 : : Oid f_relid;
5492 : : TransactionId f_mapped_xid;
5493 : : TransactionId f_create_xid;
5494 : : XLogRecPtr f_lsn;
5495 : : uint32 f_hi,
5496 : : f_lo;
5497 : : RewriteMappingFile *f;
5498 : :
4307 rhaas@postgresql.org 5499 [ + + ]: 562 : if (strcmp(mapping_de->d_name, ".") == 0 ||
5500 [ + + ]: 551 : strcmp(mapping_de->d_name, "..") == 0)
5501 : 535 : continue;
5502 : :
5503 : : /* Ignore files that aren't ours */
5504 [ - + ]: 540 : if (strncmp(mapping_de->d_name, "map-", 4) != 0)
4307 rhaas@postgresql.org 5505 :UBC 0 : continue;
5506 : :
4307 rhaas@postgresql.org 5507 [ - + ]:CBC 540 : if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
5508 : : &f_dboid, &f_relid, &f_hi, &f_lo,
5509 : : &f_mapped_xid, &f_create_xid) != 6)
4249 tgl@sss.pgh.pa.us 5510 [ # # ]:UBC 0 : elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
5511 : :
4307 rhaas@postgresql.org 5512 :CBC 540 : f_lsn = ((uint64) f_hi) << 32 | f_lo;
5513 : :
5514 : : /* mapping for another database */
5515 [ - + ]: 540 : if (f_dboid != dboid)
4307 rhaas@postgresql.org 5516 :UBC 0 : continue;
5517 : :
5518 : : /* mapping for another relation */
4307 rhaas@postgresql.org 5519 [ + + ]:CBC 540 : if (f_relid != relid)
5520 : 60 : continue;
5521 : :
5522 : : /* did the creating transaction abort? */
5523 [ + + ]: 480 : if (!TransactionIdDidCommit(f_create_xid))
5524 : 132 : continue;
5525 : :
5526 : : /* not for our transaction */
5527 [ + + ]: 348 : if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt))
5528 : 321 : continue;
5529 : :
5530 : : /* ok, relevant, queue for apply */
7 michael@paquier.xyz 5531 :GNC 27 : f = palloc_object(RewriteMappingFile);
4307 rhaas@postgresql.org 5532 :CBC 27 : f->lsn = f_lsn;
5533 : 27 : strcpy(f->fname, mapping_de->d_name);
5534 : 27 : files = lappend(files, f);
5535 : : }
5536 : 11 : FreeDir(mapping_dir);
5537 : :
5538 : : /* sort files so we apply them in LSN order */
2346 tgl@sss.pgh.pa.us 5539 : 11 : list_sort(files, file_sort_by_lsn);
5540 : :
5541 [ + + + + : 38 : foreach(file, files)
+ + ]
5542 : : {
5543 : 27 : RewriteMappingFile *f = (RewriteMappingFile *) lfirst(file);
5544 : :
4249 5545 [ - + ]: 27 : elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname,
5546 : : snapshot->subxip[0]);
4307 rhaas@postgresql.org 5547 : 27 : ApplyLogicalMappingFile(tuplecid_data, relid, f->fname);
5548 : 27 : pfree(f);
5549 : : }
5550 : 11 : }
5551 : :
5552 : : /*
5553 : : * Lookup cmin/cmax of a tuple, during logical decoding where we can't rely on
5554 : : * combo CIDs.
5555 : : */
5556 : : bool
5557 : 786 : ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data,
5558 : : Snapshot snapshot,
5559 : : HeapTuple htup, Buffer buffer,
5560 : : CommandId *cmin, CommandId *cmax)
5561 : : {
5562 : : ReorderBufferTupleCidKey key;
5563 : : ReorderBufferTupleCidEnt *ent;
5564 : : ForkNumber forkno;
5565 : : BlockNumber blockno;
4243 bruce@momjian.us 5566 : 786 : bool updated_mapping = false;
5567 : :
5568 : : /*
5569 : : * Return unresolved if tuplecid_data is not valid. That's because when
5570 : : * streaming in-progress transactions we may run into tuples with the CID
5571 : : * before actually decoding them. Think e.g. about INSERT followed by
5572 : : * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the
5573 : : * INSERT. So in such cases, we assume the CID is from the future
5574 : : * command.
5575 : : */
1957 akapila@postgresql.o 5576 [ + + ]: 786 : if (tuplecid_data == NULL)
5577 : 8 : return false;
5578 : :
5579 : : /* be careful about padding */
4307 rhaas@postgresql.org 5580 : 778 : memset(&key, 0, sizeof(key));
5581 : :
5582 [ - + ]: 778 : Assert(!BufferIsLocal(buffer));
5583 : :
5584 : : /*
5585 : : * get relfilelocator from the buffer, no convenient way to access it
5586 : : * other than that.
5587 : : */
1260 5588 : 778 : BufferGetTag(buffer, &key.rlocator, &forkno, &blockno);
5589 : :
5590 : : /* tuples can only be in the main fork */
4307 5591 [ - + ]: 778 : Assert(forkno == MAIN_FORKNUM);
5592 [ - + ]: 778 : Assert(blockno == ItemPointerGetBlockNumber(&htup->t_self));
5593 : :
5594 : 778 : ItemPointerCopy(&htup->t_self,
5595 : : &key.tid);
5596 : :
5597 : 789 : restart:
5598 : : ent = (ReorderBufferTupleCidEnt *)
1045 peter@eisentraut.org 5599 : 789 : hash_search(tuplecid_data, &key, HASH_FIND, NULL);
5600 : :
5601 : : /*
5602 : : * failed to find a mapping, check whether the table was rewritten and
5603 : : * apply mapping if so, but only do that once - there can be no new
5604 : : * mappings while we are in here since we have to hold a lock on the
5605 : : * relation.
5606 : : */
4307 rhaas@postgresql.org 5607 [ + + + + ]: 789 : if (ent == NULL && !updated_mapping)
5608 : : {
5609 : 11 : UpdateLogicalMappings(tuplecid_data, htup->t_tableOid, snapshot);
5610 : : /* now check but don't update for a mapping again */
5611 : 11 : updated_mapping = true;
5612 : 11 : goto restart;
5613 : : }
5614 [ + + ]: 778 : else if (ent == NULL)
5615 : 5 : return false;
5616 : :
5617 [ + - ]: 773 : if (cmin)
5618 : 773 : *cmin = ent->cmin;
5619 [ + - ]: 773 : if (cmax)
5620 : 773 : *cmax = ent->cmax;
5621 : 773 : return true;
5622 : : }
5623 : :
5624 : : /*
5625 : : * Count invalidation messages of specified transaction.
5626 : : *
5627 : : * Returns number of messages, and msgs is set to the pointer of the linked
5628 : : * list for the messages.
5629 : : */
5630 : : uint32
251 akapila@postgresql.o 5631 : 33 : ReorderBufferGetInvalidations(ReorderBuffer *rb, TransactionId xid,
5632 : : SharedInvalidationMessage **msgs)
5633 : : {
5634 : : ReorderBufferTXN *txn;
5635 : :
5636 : 33 : txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr,
5637 : : false);
5638 : :
5639 [ - + ]: 33 : if (txn == NULL)
251 akapila@postgresql.o 5640 :UBC 0 : return 0;
5641 : :
251 akapila@postgresql.o 5642 :CBC 33 : *msgs = txn->invalidations;
5643 : :
5644 : 33 : return txn->ninvalidations;
5645 : : }
|