Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * clog.c
4 : : * PostgreSQL transaction-commit-log manager
5 : : *
6 : : * This module stores two bits per transaction regarding its commit/abort
7 : : * status; the statuses of four transactions fit in a byte.
8 : : *
9 : : * This would be a pretty simple abstraction on top of slru.c, except that
10 : : * for performance reasons we allow multiple transactions that are
11 : : * committing concurrently to form a queue, so that a single process can
12 : : * update the status for all of them within a single lock acquisition run.
13 : : *
14 : : * XLOG interactions: this module generates an XLOG record whenever a new
15 : : * CLOG page is initialized to zeroes. Other writes of CLOG come from
16 : : * recording of transaction commit or abort in xact.c, which generates its
17 : : * own XLOG records for these events and will re-perform the status update
18 : : * on redo; so we need make no additional XLOG entry here. For synchronous
19 : : * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
20 : : * record before we are called to log a commit, so the WAL rule "write xlog
21 : : * before data" is satisfied automatically. However, for async commits we
22 : : * must track the latest LSN affecting each CLOG page, so that we can flush
23 : : * XLOG that far and satisfy the WAL rule. We don't have to worry about this
24 : : * for aborts (whether sync or async), since the post-crash assumption would
25 : : * be that such transactions failed anyway.
26 : : *
27 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
28 : : * Portions Copyright (c) 1994, Regents of the University of California
29 : : *
30 : : * src/backend/access/transam/clog.c
31 : : *
32 : : *-------------------------------------------------------------------------
33 : : */
34 : : #include "postgres.h"
35 : :
36 : : #include "access/clog.h"
37 : : #include "access/slru.h"
38 : : #include "access/transam.h"
39 : : #include "access/xlog.h"
40 : : #include "access/xloginsert.h"
41 : : #include "access/xlogutils.h"
42 : : #include "miscadmin.h"
43 : : #include "pg_trace.h"
44 : : #include "pgstat.h"
45 : : #include "storage/proc.h"
46 : : #include "storage/subsystems.h"
47 : : #include "storage/sync.h"
48 : : #include "utils/guc_hooks.h"
49 : : #include "utils/wait_event.h"
50 : :
51 : : /*
52 : : * Defines for CLOG page sizes. A page is the same BLCKSZ as is used
53 : : * everywhere else in Postgres.
54 : : *
55 : : * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
56 : : * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
57 : : * and CLOG segment numbering at
58 : : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no
59 : : * explicit notice of that fact in this module, except when comparing segment
60 : : * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
61 : : */
62 : :
63 : : /* We need two bits per xact, so four xacts fit in a byte */
64 : : #define CLOG_BITS_PER_XACT 2
65 : : #define CLOG_XACTS_PER_BYTE 4
66 : : #define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) /* xacts tracked by one BLCKSZ-byte page */
67 : : #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) /* low-order mask covering one xact's status bits */
68 : :
69 : : /*
70 : : * Because space used in CLOG by each transaction is so small, we place a
71 : : * smaller limit on the number of CLOG buffers than SLRU allows. No other
72 : : * SLRU needs this.
   : : *
   : : * The limit is the smaller of SLRU_MAX_ALLOWED_BUFFERS and the number of
   : : * CLOG pages (rounded up) needed to hold MaxTransactionId / 2 transactions.
73 : : */
74 : : #define CLOG_MAX_ALLOWED_BUFFERS \
75 : : Min(SLRU_MAX_ALLOWED_BUFFERS, \
76 : : (((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE))
77 : :
78 : :
79 : : /*
80 : : * Although we return an int64 the actual value can't currently exceed
81 : : * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE.
82 : : */
83 : : static inline int64
888 akorotkov@postgresql 84 :CBC 1334738 : TransactionIdToPage(TransactionId xid)
85 : : {
86 : 1334738 : return xid / (int64) CLOG_XACTS_PER_PAGE;
87 : : }
88 : :
89 : : #define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) /* xact's index within its CLOG page */
90 : : #define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) /* offset of the byte in that page holding the xact's bits */
91 : : #define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) /* xact's position (0 .. CLOG_XACTS_PER_BYTE - 1) inside that byte */
92 : :
93 : : /* We store the latest async LSN for each group of transactions */
94 : : #define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
95 : : #define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) /* LSN entries kept per CLOG page */
96 : : /* GetLSNIndex: index of xid's LSN-group entry, given the buffer slot (slotno) that holds xid's page */
97 : : #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
98 : : ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
99 : :
100 : : /*
101 : : * The maximum number of subtransactions for which we consider applying the
102 : : * clog group update optimization (the check is nsubxids <= this value).
103 : : * Testing reveals that a higher number can hurt performance.
104 : : */
105 : : #define THRESHOLD_SUBTRANS_CLOG_OPT 5
106 : :
107 : : /*
108 : : * Link to shared-memory data structures for CLOG control
109 : : */
110 : : static void CLOGShmemRequest(void *arg);
111 : : static void CLOGShmemInit(void *arg);
112 : : static bool CLOGPagePrecedes(int64 page1, int64 page2);
113 : : static int clog_errdetail_for_io_error(const void *opaque_data);
114 : : /* Exported callback table: wires up the request and init routines declared above */
115 : : const ShmemCallbacks CLOGShmemCallbacks = {
116 : : .request_fn = CLOGShmemRequest,
117 : : .init_fn = CLOGShmemInit,
118 : : };
119 : : /* File-local SLRU descriptor for the CLOG */
120 : : static SlruDesc XactSlruDesc;
121 : : /* Shorthand used throughout this file: pointer to the descriptor above */
122 : : #define XactCtl (&XactSlruDesc)
123 : :
124 : : /* Forward declarations of file-local (static) helper routines */
125 : : static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
126 : : Oid oldestXactDb);
127 : : static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
128 : : TransactionId *subxids, XidStatus status,
129 : : XLogRecPtr lsn, int64 pageno,
130 : : bool all_xact_same_page);
131 : : static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
132 : : XLogRecPtr lsn, int slotno);
133 : : static void set_status_by_pages(int nsubxids, TransactionId *subxids,
134 : : XidStatus status, XLogRecPtr lsn);
135 : : static bool TransactionGroupUpdateXidStatus(TransactionId xid,
136 : : XidStatus status, XLogRecPtr lsn, int64 pageno);
137 : : static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
138 : : TransactionId *subxids, XidStatus status,
139 : : XLogRecPtr lsn, int64 pageno);
140 : :
141 : :
142 : : /*
143 : : * TransactionIdSetTreeStatus
144 : : *
145 : : * Record the final state of transaction entries in the commit log for
146 : : * a transaction and its subtransaction tree. Take care to ensure this is
147 : : * efficient, and as atomic as possible.
148 : : *
149 : : * xid is a single xid to set status for. This will typically be
150 : : * the top level transactionid for a top level commit or abort. It can
151 : : * also be a subtransaction when we record transaction aborts.
152 : : *
153 : : * subxids is an array of xids of length nsubxids, representing subtransactions
154 : : * in the tree of xid. In various cases nsubxids may be zero.
155 : : *
156 : : * lsn must be the WAL location of the commit record when recording an async
157 : : * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
158 : : * caller guarantees the commit record is already flushed in that case. It
159 : : * should be InvalidXLogRecPtr for abort cases, too.
160 : : *
161 : : * In the commit case, atomicity is limited by whether all the subxids are in
162 : : * the same CLOG page as xid. If they all are, then the lock will be grabbed
163 : : * only once, and the status will be set to committed directly. Otherwise
164 : : * we must
165 : : * 1. set sub-committed all subxids that are not on the same page as the
166 : : * main xid
167 : : * 2. atomically set committed the main xid and the subxids on the same page
168 : : * 3. go over the first bunch again and set them committed
169 : : * Note that as far as concurrent checkers are concerned, main transaction
170 : : * commit as a whole is still atomic.
171 : : *
172 : : * Example:
173 : : * TransactionId t commits and has subxids t1, t2, t3, t4
174 : : * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
175 : : * 1. update pages2-3:
176 : : * page2: set t2,t3 as sub-committed
177 : : * page3: set t4 as sub-committed
178 : : * 2. update page1:
179 : : * page1: set t,t1 as committed
180 : : * 3. update pages2-3:
181 : : * page2: set t2,t3 as committed
182 : : * page3: set t4 as committed
183 : : *
184 : : * NB: this is a low-level routine and is NOT the preferred entry point
185 : : * for most uses; functions in transam.c are the intended callers.
186 : : *
187 : : * XXX Think about issuing POSIX_FADV_WILLNEED on pages that we will need,
188 : : * but aren't yet in cache, as well as hinting pages not to fall out of
189 : : * cache yet.
190 : : */
191 : : void
6406 alvherre@alvh.no-ip. 192 : 192052 : TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
193 : : TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
194 : : {
888 akorotkov@postgresql 195 : 192052 : int64 pageno = TransactionIdToPage(xid); /* get page of parent */
196 : : int i;
197 : :
6406 alvherre@alvh.no-ip. 198 [ + + - + ]: 192052 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
199 : : status == TRANSACTION_STATUS_ABORTED);
200 : :
201 : : /*
202 : : * See how many subxids, if any, are on the same page as the parent, if
203 : : * any.
204 : : */
205 [ + + ]: 198755 : for (i = 0; i < nsubxids; i++)
206 : : {
207 [ - + ]: 6703 : if (TransactionIdToPage(subxids[i]) != pageno)
6406 alvherre@alvh.no-ip. 208 :UBC 0 : break;
209 : : }
210 : :
211 : : /*
212 : : * Do all items fit on a single page?
213 : : */
6406 alvherre@alvh.no-ip. 214 [ + - ]:CBC 192052 : if (i == nsubxids)
215 : : {
216 : : /*
217 : : * Set the parent and all subtransactions in a single call
218 : : */
219 : 192052 : TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
220 : : pageno, true);
221 : : }
222 : : else
223 : : {
6172 bruce@momjian.us 224 :UBC 0 : int nsubxids_on_first_page = i;
225 : :
226 : : /*
227 : : * If this is a commit then we care about doing this correctly (i.e.
228 : : * using the subcommitted intermediate status). By here, we know
229 : : * we're updating more than one page of clog, so we must mark entries
230 : : * that are *not* on the first page so that they show as subcommitted
231 : : * before we then return to update the status to fully committed.
232 : : *
233 : : * To avoid touching the first page twice, skip marking subcommitted
234 : : * for the subxids on that first page.
235 : : */
6406 alvherre@alvh.no-ip. 236 [ # # ]: 0 : if (status == TRANSACTION_STATUS_COMMITTED)
237 : 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
238 : 0 : subxids + nsubxids_on_first_page,
239 : : TRANSACTION_STATUS_SUB_COMMITTED, lsn);
240 : :
241 : : /*
242 : : * Now set the parent and subtransactions on same page as the parent,
243 : : * if any
244 : : */
245 : 0 : pageno = TransactionIdToPage(xid);
246 : 0 : TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
247 : : lsn, pageno, false);
248 : :
249 : : /*
250 : : * Now work through the rest of the subxids one clog page at a time,
251 : : * starting from the second page onwards, like we did above.
252 : : */
253 : 0 : set_status_by_pages(nsubxids - nsubxids_on_first_page,
254 : 0 : subxids + nsubxids_on_first_page,
255 : : status, lsn);
256 : : }
6406 alvherre@alvh.no-ip. 257 :CBC 192052 : }
258 : :
259 : : /*
260 : : * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
261 : : * transactions, chunking in the separate CLOG pages involved. We never
262 : : * pass the whole transaction tree to this function, only subtransactions
263 : : * that are on different pages to the top level transaction id.
264 : : */
265 : : static void
6406 alvherre@alvh.no-ip. 266 :UBC 0 : set_status_by_pages(int nsubxids, TransactionId *subxids,
267 : : XidStatus status, XLogRecPtr lsn)
268 : : {
888 akorotkov@postgresql 269 : 0 : int64 pageno = TransactionIdToPage(subxids[0]);
6172 bruce@momjian.us 270 : 0 : int offset = 0;
271 : 0 : int i = 0;
272 : :
3133 tgl@sss.pgh.pa.us 273 [ # # ]: 0 : Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */
274 : :
6406 alvherre@alvh.no-ip. 275 [ # # ]: 0 : while (i < nsubxids)
276 : : {
6172 bruce@momjian.us 277 : 0 : int num_on_page = 0;
278 : : int64 nextpageno;
279 : :
280 : : do
281 : : {
3133 tgl@sss.pgh.pa.us 282 : 0 : nextpageno = TransactionIdToPage(subxids[i]);
283 [ # # ]: 0 : if (nextpageno != pageno)
284 : 0 : break;
6406 alvherre@alvh.no-ip. 285 : 0 : num_on_page++;
286 : 0 : i++;
3133 tgl@sss.pgh.pa.us 287 [ # # ]: 0 : } while (i < nsubxids);
288 : :
6406 alvherre@alvh.no-ip. 289 : 0 : TransactionIdSetPageStatus(InvalidTransactionId,
290 : 0 : num_on_page, subxids + offset,
291 : : status, lsn, pageno, false);
292 : 0 : offset = i;
3133 tgl@sss.pgh.pa.us 293 : 0 : pageno = nextpageno;
294 : : }
6406 alvherre@alvh.no-ip. 295 : 0 : }
296 : :
297 : : /*
298 : : * Record the final state of transaction entries in the commit log for all
299 : : * entries on a single page. Atomic only on this page.
300 : : */
301 : : static void
6406 alvherre@alvh.no-ip. 302 :CBC 192052 : TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
303 : : TransactionId *subxids, XidStatus status,
304 : : XLogRecPtr lsn, int64 pageno,
305 : : bool all_xact_same_page)
306 : : {
307 : : LWLock *lock;
308 : :
309 : : /* Can't use group update when PGPROC overflows. */
310 : : StaticAssertDecl(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS,
311 : : "group clog threshold less than PGPROC cached subxids");
312 : :
313 : : /* Get the SLRU bank lock for the page we are going to access. */
797 314 : 192052 : lock = SimpleLruGetBankLock(XactCtl, pageno);
315 : :
316 : : /*
317 : : * When there is contention on the SLRU bank lock we need, we try to group
318 : : * multiple updates; a single leader process will perform transaction
319 : : * status updates for multiple backends so that the number of times the
320 : : * bank lock needs to be acquired is reduced.
321 : : *
322 : : * For this optimization to be safe, the XID and subxids in MyProc must be
323 : : * the same as the ones for which we're setting the status. Check that
324 : : * this is the case.
325 : : *
326 : : * For this optimization to be efficient, we shouldn't have too many
327 : : * sub-XIDs and all of the XIDs for which we're adjusting clog should be
328 : : * on the same page. Check those conditions, too.
329 : : */
2090 andres@anarazel.de 330 [ + - + + : 192052 : if (all_xact_same_page && xid == MyProc->xid &&
+ + ]
3168 rhaas@postgresql.org 331 : 165004 : nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT &&
2090 andres@anarazel.de 332 [ + - + + ]: 165004 : nsubxids == MyProc->subxidStatus.count &&
1524 tgl@sss.pgh.pa.us 333 : 505 : (nsubxids == 0 ||
334 [ + - ]: 505 : memcmp(subxids, MyProc->subxids.xids,
335 : : nsubxids * sizeof(TransactionId)) == 0))
336 : : {
337 : : /*
338 : : * If we can immediately acquire the lock, we update the status of our
339 : : * own XID and release the lock. If not, try use group XID update. If
340 : : * that doesn't work out, fall back to waiting for the lock to perform
341 : : * an update for this transaction only.
342 : : */
797 alvherre@alvh.no-ip. 343 [ + + ]: 165004 : if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
344 : : {
345 : : /* Got the lock without waiting! Do the update. */
3168 rhaas@postgresql.org 346 : 164920 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
347 : : lsn, pageno);
797 alvherre@alvh.no-ip. 348 : 164920 : LWLockRelease(lock);
3168 rhaas@postgresql.org 349 : 164920 : return;
350 : : }
351 [ + - ]: 84 : else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno))
352 : : {
353 : : /* Group update mechanism has done the work. */
354 : 84 : return;
355 : : }
356 : :
357 : : /* Fall through only if update isn't done yet. */
358 : : }
359 : :
360 : : /* Group update not applicable, or couldn't accept this page number. */
797 alvherre@alvh.no-ip. 361 : 27048 : LWLockAcquire(lock, LW_EXCLUSIVE);
3168 rhaas@postgresql.org 362 : 27048 : TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status,
363 : : lsn, pageno);
797 alvherre@alvh.no-ip. 364 : 27048 : LWLockRelease(lock);
365 : : }
366 : :
367 : : /*
368 : : * Record the final state of transaction entry in the commit log
369 : : *
370 : : * We don't do any locking here; caller must handle that.
371 : : */
372 : : static void
3168 rhaas@postgresql.org 373 : 192052 : TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
374 : : TransactionId *subxids, XidStatus status,
375 : : XLogRecPtr lsn, int64 pageno)
376 : : {
377 : : int slotno;
378 : : int i;
379 : :
9019 tgl@sss.pgh.pa.us 380 [ + + - + : 192052 : Assert(status == TRANSACTION_STATUS_COMMITTED ||
- - - - ]
381 : : status == TRANSACTION_STATUS_ABORTED ||
382 : : (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
797 alvherre@alvh.no-ip. 383 [ - + ]: 192052 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl, pageno),
384 : : LW_EXCLUSIVE));
385 : :
386 : : /*
387 : : * If we're doing an async commit (ie, lsn is valid), then we must wait
388 : : * for any active write on the page slot to complete. Otherwise our
389 : : * update could reach disk in that write, which will not do since we
390 : : * mustn't let it reach disk until we've done the appropriate WAL flush.
391 : : * But when lsn is invalid, it's OK to scribble on a page while it is
392 : : * write-busy, since we don't care if the update reaches disk sooner than
393 : : * we think.
394 : : */
53 heikki.linnakangas@i 395 :GNC 192052 : slotno = SimpleLruReadPage(XactCtl, pageno, !XLogRecPtrIsValid(lsn), &xid);
396 : :
397 : : /*
398 : : * Set the main transaction id, if any.
399 : : *
400 : : * If we update more than one xid on this page while it is being written
401 : : * out, we might find that some of the bits go to disk and others don't.
402 : : * If we are updating commits on the page with the top-level xid that
403 : : * could break atomicity, so we subcommit the subxids first before we mark
404 : : * the top-level commit.
405 : : */
6406 alvherre@alvh.no-ip. 406 [ + - ]:CBC 192052 : if (TransactionIdIsValid(xid))
407 : : {
408 : : /* Subtransactions first, if needed ... */
409 [ + + ]: 192052 : if (status == TRANSACTION_STATUS_COMMITTED)
410 : : {
411 [ + + ]: 187078 : for (i = 0; i < nsubxids; i++)
412 : : {
2181 tgl@sss.pgh.pa.us 413 [ - + ]: 6376 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
6406 alvherre@alvh.no-ip. 414 : 6376 : TransactionIdSetStatusBit(subxids[i],
415 : : TRANSACTION_STATUS_SUB_COMMITTED,
416 : : lsn, slotno);
417 : : }
418 : : }
419 : :
420 : : /* ... then the main transaction */
421 : 192052 : TransactionIdSetStatusBit(xid, status, lsn, slotno);
422 : : }
423 : :
424 : : /* Set the subtransactions */
425 [ + + ]: 198755 : for (i = 0; i < nsubxids; i++)
426 : : {
2181 tgl@sss.pgh.pa.us 427 [ - + ]: 6703 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
6406 alvherre@alvh.no-ip. 428 : 6703 : TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
429 : : }
430 : :
2181 tgl@sss.pgh.pa.us 431 : 192052 : XactCtl->shared->page_dirty[slotno] = true;
3168 rhaas@postgresql.org 432 : 192052 : }
433 : :
434 : : /*
435 : : * Subroutine for TransactionIdSetPageStatus, q.v.
436 : : *
437 : : * When we cannot immediately acquire the SLRU bank lock in exclusive mode at
438 : : * commit time, add ourselves to a list of processes that need their XIDs
439 : : * status update. The first process to add itself to the list will acquire
440 : : * the lock in exclusive mode and set transaction status as required on behalf
441 : : * of all group members. This avoids a great deal of contention when many
442 : : * processes are trying to commit at once, since the lock need not be
443 : : * repeatedly handed off from one committing process to the next.
444 : : *
445 : : * Returns true when transaction status has been updated in clog; returns
446 : : * false if we decided against applying the optimization because the page
447 : : * number we need to update differs from those processes already waiting.
448 : : */
449 : : static bool
450 : 84 : TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status,
451 : : XLogRecPtr lsn, int64 pageno)
452 : : {
453 : 84 : volatile PROC_HDR *procglobal = ProcGlobal;
454 : 84 : PGPROC *proc = MyProc;
455 : : uint32 nextidx;
456 : : uint32 wakeidx;
457 : : int64 prevpageno;
797 alvherre@alvh.no-ip. 458 : 84 : LWLock *prevlock = NULL;
459 : :
460 : : /* We should definitely have an XID whose status needs to be updated. */
3168 rhaas@postgresql.org 461 [ - + ]: 84 : Assert(TransactionIdIsValid(xid));
462 : :
463 : : /*
464 : : * Prepare to add ourselves to the list of processes needing a group XID
465 : : * status update.
466 : : */
467 : 84 : proc->clogGroupMember = true;
468 : 84 : proc->clogGroupMemberXid = xid;
469 : 84 : proc->clogGroupMemberXidStatus = status;
470 : 84 : proc->clogGroupMemberPage = pageno;
471 : 84 : proc->clogGroupMemberLsn = lsn;
472 : :
473 : : /*
474 : : * We put ourselves in the queue by writing MyProcNumber to
475 : : * ProcGlobal->clogGroupFirst. However, if there's already a process
476 : : * listed there, we compare our pageno with that of that process; if it
477 : : * differs, we cannot participate in the group, so we return for caller to
478 : : * update pg_xact in the normal way.
479 : : *
480 : : * If we're not the first process in the list, we must follow the leader.
481 : : * We do this by storing the data we want updated in our PGPROC entry
482 : : * where the leader can find it, then going to sleep.
483 : : *
484 : : * If no process is already in the list, we're the leader; our first step
485 : : * is to lock the SLRU bank to which our page belongs, then we close out
486 : : * the group by resetting the list pointer from ProcGlobal->clogGroupFirst
487 : : * (this lets other processes set up other groups later); finally we do
488 : : * the SLRU updates, release the SLRU bank lock, and wake up the sleeping
489 : : * processes.
490 : : *
491 : : * If another group starts to update a page in a different SLRU bank, they
492 : : * can proceed concurrently, since the bank lock they're going to use is
493 : : * different from ours. If another group starts to update a page in the
494 : : * same bank as ours, they wait until we release the lock.
495 : : */
496 : 84 : nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst);
497 : :
498 : : while (true)
499 : : {
500 : : /*
501 : : * Add the proc to list, if the clog page where we need to update the
502 : : * current transaction status is same as group leader's clog page.
503 : : *
504 : : * There is a race condition here, which is that after doing the below
505 : : * check and before adding this proc's clog update to a group, the
506 : : * group leader might have already finished the group update for this
507 : : * page and becomes group leader of another group, updating a
508 : : * different page. This will lead to a situation where a single group
509 : : * can have different clog page updates. This isn't likely and will
510 : : * still work, just less efficiently -- we handle this case by
511 : : * switching to a different bank lock in the loop below.
512 : : */
793 heikki.linnakangas@i 513 [ + + ]: 84 : if (nextidx != INVALID_PROC_NUMBER &&
803 514 [ - + ]: 21 : GetPGProcByNumber(nextidx)->clogGroupMemberPage != proc->clogGroupMemberPage)
515 : : {
516 : : /*
517 : : * Ensure that this proc is not a member of any clog group that
518 : : * needs an XID status update.
519 : : */
3168 rhaas@postgresql.org 520 :UBC 0 : proc->clogGroupMember = false;
793 heikki.linnakangas@i 521 : 0 : pg_atomic_write_u32(&proc->clogGroupNext, INVALID_PROC_NUMBER);
3168 rhaas@postgresql.org 522 : 0 : return false;
523 : : }
524 : :
3168 rhaas@postgresql.org 525 :CBC 84 : pg_atomic_write_u32(&proc->clogGroupNext, nextidx);
526 : :
527 [ + - ]: 84 : if (pg_atomic_compare_exchange_u32(&procglobal->clogGroupFirst,
528 : : &nextidx,
529 : : (uint32) MyProcNumber))
530 : 84 : break;
531 : : }
532 : :
533 : : /*
534 : : * If the list was not empty, the leader will update the status of our
535 : : * XID. It is impossible to have followers without a leader because the
536 : : * first process that has added itself to the list will always have
537 : : * nextidx as INVALID_PROC_NUMBER.
538 : : */
793 heikki.linnakangas@i 539 [ + + ]: 84 : if (nextidx != INVALID_PROC_NUMBER)
540 : : {
3168 rhaas@postgresql.org 541 : 21 : int extraWaits = 0;
542 : :
543 : : /* Sleep until the leader updates our XID status. */
2180 tgl@sss.pgh.pa.us 544 : 21 : pgstat_report_wait_start(WAIT_EVENT_XACT_GROUP_UPDATE);
545 : : for (;;)
546 : : {
547 : : /* acts as a read barrier */
3168 rhaas@postgresql.org 548 : 21 : PGSemaphoreLock(proc->sem);
549 [ + - ]: 21 : if (!proc->clogGroupMember)
550 : 21 : break;
3168 rhaas@postgresql.org 551 :UBC 0 : extraWaits++;
552 : : }
3168 rhaas@postgresql.org 553 :CBC 21 : pgstat_report_wait_end();
554 : :
793 heikki.linnakangas@i 555 [ - + ]: 21 : Assert(pg_atomic_read_u32(&proc->clogGroupNext) == INVALID_PROC_NUMBER);
556 : :
557 : : /* Fix semaphore count for any absorbed wakeups */
3168 rhaas@postgresql.org 558 [ - + ]: 21 : while (extraWaits-- > 0)
3168 rhaas@postgresql.org 559 :UBC 0 : PGSemaphoreUnlock(proc->sem);
3168 rhaas@postgresql.org 560 :CBC 21 : return true;
561 : : }
562 : :
563 : : /*
564 : : * By here, we know we're the leader process. Acquire the SLRU bank lock
565 : : * that corresponds to the page we originally wanted to modify.
566 : : */
797 alvherre@alvh.no-ip. 567 : 63 : prevpageno = proc->clogGroupMemberPage;
568 : 63 : prevlock = SimpleLruGetBankLock(XactCtl, prevpageno);
569 : 63 : LWLockAcquire(prevlock, LW_EXCLUSIVE);
570 : :
571 : : /*
572 : : * Now that we've got the lock, clear the list of processes waiting for
573 : : * group XID status update, saving a pointer to the head of the list.
574 : : * (Trying to pop elements one at a time could lead to an ABA problem.)
575 : : *
576 : : * At this point, any processes trying to do this would create a separate
577 : : * group.
578 : : */
3168 rhaas@postgresql.org 579 : 63 : nextidx = pg_atomic_exchange_u32(&procglobal->clogGroupFirst,
580 : : INVALID_PROC_NUMBER);
581 : :
582 : : /* Remember head of list so we can perform wakeups after dropping lock. */
583 : 63 : wakeidx = nextidx;
584 : :
585 : : /* Walk the list and update the status of all XIDs. */
793 heikki.linnakangas@i 586 [ + + ]: 147 : while (nextidx != INVALID_PROC_NUMBER)
587 : : {
120 drowley@postgresql.o 588 :GNC 84 : PGPROC *nextproc = GetPGProcByNumber(nextidx);
651 michael@paquier.xyz 589 :CBC 84 : int64 thispageno = nextproc->clogGroupMemberPage;
590 : :
591 : : /*
592 : : * If the page to update belongs to a different bank than the previous
593 : : * one, exchange bank lock to the new one. This should be quite rare,
594 : : * as described above.
595 : : *
596 : : * (We could try to optimize this by waking up the processes for which
597 : : * we have already updated the status while we exchange the lock, but
598 : : * the code doesn't do that at present. I think it'd require
599 : : * additional bookkeeping, making the common path slower in order to
600 : : * improve an infrequent case.)
601 : : */
797 alvherre@alvh.no-ip. 602 [ - + ]: 84 : if (thispageno != prevpageno)
603 : : {
797 alvherre@alvh.no-ip. 604 :UBC 0 : LWLock *lock = SimpleLruGetBankLock(XactCtl, thispageno);
605 : :
606 [ # # ]: 0 : if (prevlock != lock)
607 : : {
608 : 0 : LWLockRelease(prevlock);
609 : 0 : LWLockAcquire(lock, LW_EXCLUSIVE);
610 : : }
611 : 0 : prevlock = lock;
612 : 0 : prevpageno = thispageno;
613 : : }
614 : :
615 : : /*
616 : : * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs
617 : : * should not use group XID status update mechanism.
618 : : */
1308 drowley@postgresql.o 619 [ - + ]:CBC 84 : Assert(nextproc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT);
620 : :
621 : 84 : TransactionIdSetPageStatusInternal(nextproc->clogGroupMemberXid,
622 : 84 : nextproc->subxidStatus.count,
623 : 84 : nextproc->subxids.xids,
624 : : nextproc->clogGroupMemberXidStatus,
625 : : nextproc->clogGroupMemberLsn,
626 : : nextproc->clogGroupMemberPage);
627 : :
628 : : /* Move to next proc in list. */
629 : 84 : nextidx = pg_atomic_read_u32(&nextproc->clogGroupNext);
630 : : }
631 : :
632 : : /* We're done with the lock now. */
797 alvherre@alvh.no-ip. 633 [ + - ]: 63 : if (prevlock != NULL)
634 : 63 : LWLockRelease(prevlock);
635 : :
636 : : /*
637 : : * Now that we've released the lock, go back and wake everybody up. We
638 : : * don't do this under the lock so as to keep lock hold times to a
639 : : * minimum.
640 : : *
641 : : * (Perhaps we could do this in two passes, the first setting
642 : : * clogGroupNext to invalid while saving the semaphores to an array, then
643 : : * a single write barrier, then another pass unlocking the semaphores.)
644 : : */
793 heikki.linnakangas@i 645 [ + + ]: 147 : while (wakeidx != INVALID_PROC_NUMBER)
646 : : {
120 drowley@postgresql.o 647 :GNC 84 : PGPROC *wakeproc = GetPGProcByNumber(wakeidx);
648 : :
1308 drowley@postgresql.o 649 :CBC 84 : wakeidx = pg_atomic_read_u32(&wakeproc->clogGroupNext);
793 heikki.linnakangas@i 650 : 84 : pg_atomic_write_u32(&wakeproc->clogGroupNext, INVALID_PROC_NUMBER);
651 : :
652 : : /* ensure all previous writes are visible before follower continues. */
3168 rhaas@postgresql.org 653 : 84 : pg_write_barrier();
654 : :
1308 drowley@postgresql.o 655 : 84 : wakeproc->clogGroupMember = false;
656 : :
657 [ + + ]: 84 : if (wakeproc != MyProc)
658 : 21 : PGSemaphoreUnlock(wakeproc->sem);
659 : : }
660 : :
3168 rhaas@postgresql.org 661 : 63 : return true;
662 : : }
663 : :
664 : : /*
665 : : * Sets the commit status of a single transaction.
666 : : *
667 : : * Caller must hold the corresponding SLRU bank lock, will be held at exit.
668 : : */
669 : : static void
6406 alvherre@alvh.no-ip. 670 : 205131 : TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
671 : : {
672 : 205131 : int byteno = TransactionIdToByte(xid);
673 : 205131 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
674 : : char *byteptr;
675 : : char byteval;
676 : : char curval;
677 : :
797 678 [ - + ]: 205131 : Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
679 [ - + ]: 205131 : Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
680 : : XactCtl->shared->page_number[slotno]),
681 : : LW_EXCLUSIVE));
682 : :
2181 tgl@sss.pgh.pa.us 683 : 205131 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
6392 alvherre@alvh.no-ip. 684 : 205131 : curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
685 : :
686 : : /*
687 : : * When replaying transactions during recovery we still need to perform
688 : : * the two phases of subcommit and then commit. However, some transactions
689 : : * are already correctly marked, so we just treat those as a no-op which
690 : : * allows us to keep the following Assert as restrictive as possible.
691 : : */
692 [ + + + + : 205131 : if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
- + ]
693 : : curval == TRANSACTION_STATUS_COMMITTED)
6392 alvherre@alvh.no-ip. 694 :UBC 0 : return;
695 : :
696 : : /*
697 : : * Current state change should be from 0 or subcommitted to target state
698 : : * or we should already be there when replaying changes during recovery.
699 : : */
6392 alvherre@alvh.no-ip. 700 [ + + + + :CBC 205131 : Assert(curval == 0 ||
- + - + ]
701 : : (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
702 : : status != TRANSACTION_STATUS_IN_PROGRESS) ||
703 : : curval == status);
704 : :
705 : : /* note this assumes exclusive access to the clog page */
7976 tgl@sss.pgh.pa.us 706 : 205131 : byteval = *byteptr;
707 : 205131 : byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
708 : 205131 : byteval |= (status << bshift);
709 : 205131 : *byteptr = byteval;
710 : :
711 : : /*
712 : : * Update the group LSN if the transaction completion LSN is higher.
713 : : *
714 : : * Note: lsn will be invalid when supplied during InRecovery processing,
715 : : * so we don't need to do anything special to avoid LSN updates during
716 : : * recovery. After recovery completes the next clog change will set the
717 : : * LSN correctly.
718 : : */
180 alvherre@kurilemu.de 719 [ + + ]:GNC 205131 : if (XLogRecPtrIsValid(lsn))
720 : : {
6852 tgl@sss.pgh.pa.us 721 :CBC 31476 : int lsnindex = GetLSNIndex(slotno, xid);
722 : :
2181 723 [ + + ]: 31476 : if (XactCtl->shared->group_lsn[lsnindex] < lsn)
724 : 28860 : XactCtl->shared->group_lsn[lsnindex] = lsn;
725 : : }
726 : : }
727 : :
728 : : /*
729 : : * Interrogate the state of a transaction in the commit log.
730 : : *
731 : : * Aside from the actual commit status, this function returns (into *lsn)
732 : : * an LSN that is late enough to be able to guarantee that if we flush up to
733 : : * that LSN then we will have flushed the transaction's commit record to disk.
734 : : * The result is not necessarily the exact LSN of the transaction's commit
735 : : * record! For example, for long-past transactions (those whose clog pages
736 : : * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because
737 : : * we group transactions on the same clog page to conserve storage, we might
738 : : * return the LSN of a later transaction that falls into the same group.
739 : : *
740 : : * NB: this is a low-level routine and is NOT the preferred entry point
741 : : * for most uses; TransactionLogFetch() in transam.c is the intended caller.
742 : : */
743 : : XidStatus
6852 744 : 915517 : TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
745 : : {
888 akorotkov@postgresql 746 : 915517 : int64 pageno = TransactionIdToPage(xid);
9019 tgl@sss.pgh.pa.us 747 : 915517 : int byteno = TransactionIdToByte(xid);
748 : 915517 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
749 : : int slotno;
750 : : int lsnindex;
751 : : char *byteptr;
752 : : XidStatus status;
753 : :
754 : : /* lock is acquired by SimpleLruReadPage_ReadOnly */
755 : :
53 heikki.linnakangas@i 756 :GNC 915517 : slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, &xid);
2181 tgl@sss.pgh.pa.us 757 :CBC 915517 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
758 : :
9019 759 : 915517 : status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
760 : :
6852 761 : 915517 : lsnindex = GetLSNIndex(slotno, xid);
2181 762 : 915517 : *lsn = XactCtl->shared->group_lsn[lsnindex];
763 : :
797 alvherre@alvh.no-ip. 764 : 915517 : LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno));
765 : :
9019 tgl@sss.pgh.pa.us 766 : 915517 : return status;
767 : : }
768 : :
769 : : /*
770 : : * Number of shared CLOG buffers.
771 : : *
772 : : * If asked to autotune, use 2MB for every 1GB of shared buffers, up to 8MB.
773 : : * Otherwise just cap the configured amount to be between 16 and the maximum
774 : : * allowed.
775 : : */
776 : : static int
5233 rhaas@postgresql.org 777 : 2483 : CLOGShmemBuffers(void)
778 : : {
779 : : /* auto-tune based on shared buffers */
797 alvherre@alvh.no-ip. 780 [ + + ]: 2483 : if (transaction_buffers == 0)
781 : 1239 : return SimpleLruAutotuneBuffers(512, 1024);
782 : :
783 [ + - + - ]: 1244 : return Min(Max(16, transaction_buffers), CLOG_MAX_ALLOWED_BUFFERS);
784 : : }
785 : :
786 : : /*
787 : : * Register shared memory for CLOG
788 : : */
789 : : static void
29 heikki.linnakangas@i 790 :GNC 1244 : CLOGShmemRequest(void *arg)
791 : : {
792 : : /* If auto-tuning is requested, now is the time to do it */
797 alvherre@alvh.no-ip. 793 [ + + ]:CBC 1244 : if (transaction_buffers == 0)
794 : : {
795 : : char buf[32];
796 : :
797 : 1239 : snprintf(buf, sizeof(buf), "%d", CLOGShmemBuffers());
798 : 1239 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
799 : : PGC_S_DYNAMIC_DEFAULT);
800 : :
801 : : /*
802 : : * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
803 : : * However, if the DBA explicitly set transaction_buffers = 0 in the
804 : : * config file, then PGC_S_DYNAMIC_DEFAULT will fail to override that
805 : : * and we must force the matter with PGC_S_OVERRIDE.
806 : : */
807 [ - + ]: 1239 : if (transaction_buffers == 0) /* failed to apply it? */
797 alvherre@alvh.no-ip. 808 :UBC 0 : SetConfigOption("transaction_buffers", buf, PGC_POSTMASTER,
809 : : PGC_S_OVERRIDE);
810 : : }
797 alvherre@alvh.no-ip. 811 [ - + ]:CBC 1244 : Assert(transaction_buffers != 0);
29 heikki.linnakangas@i 812 :GNC 1244 : SimpleLruRequest(.desc = &XactSlruDesc,
813 : : .name = "transaction",
814 : : .Dir = "pg_xact",
815 : : .long_segment_names = false,
816 : :
817 : : .nslots = CLOGShmemBuffers(),
818 : : .nlsns = CLOG_LSNS_PER_PAGE,
819 : :
820 : : .sync_handler = SYNC_HANDLER_CLOG,
821 : : .PagePrecedes = CLOGPagePrecedes,
822 : : .errdetail_for_io_error = clog_errdetail_for_io_error,
823 : :
824 : : .buffer_tranche_id = LWTRANCHE_XACT_BUFFER,
825 : : .bank_tranche_id = LWTRANCHE_XACT_SLRU,
826 : : );
827 : 1244 : }
828 : :
829 : : static void
830 : 1241 : CLOGShmemInit(void *arg)
831 : : {
1935 noah@leadboat.com 832 :CBC 1241 : SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
9019 tgl@sss.pgh.pa.us 833 : 1241 : }
834 : :
835 : : /*
836 : : * GUC check_hook for transaction_buffers
837 : : */
838 : : bool
797 alvherre@alvh.no-ip. 839 : 2525 : check_transaction_buffers(int *newval, void **extra, GucSource source)
840 : : {
841 : 2525 : return check_slru_buffers("transaction_buffers", newval);
842 : : }
843 : :
844 : : /*
845 : : * This func must be called ONCE on system install. It creates
846 : : * the initial CLOG segment. (The CLOG directory is assumed to
847 : : * have been created by initdb, and CLOGShmemInit must have been
848 : : * called already.)
849 : : */
850 : : void
9019 tgl@sss.pgh.pa.us 851 : 57 : BootStrapCLOG(void)
852 : : {
853 : : /* Zero the initial page and flush it to disk */
302 alvherre@kurilemu.de 854 :GNC 57 : SimpleLruZeroAndWritePage(XactCtl, 0);
9019 tgl@sss.pgh.pa.us 855 :GIC 57 : }
856 : :
857 : : /*
858 : : * This must be called ONCE during postmaster or standalone-backend startup,
859 : : * after StartupXLOG has initialized TransamVariables->nextXid.
860 : : */
861 : : void
9019 tgl@sss.pgh.pa.us 862 :CBC 1077 : StartupCLOG(void)
863 : : {
879 heikki.linnakangas@i 864 : 1077 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
888 akorotkov@postgresql 865 : 1077 : int64 pageno = TransactionIdToPage(xid);
866 : :
867 : : /*
868 : : * Initialize our idea of the latest page number.
869 : : */
819 alvherre@alvh.no-ip. 870 : 1077 : pg_atomic_write_u64(&XactCtl->shared->latest_page_number, pageno);
5298 simon@2ndQuadrant.co 871 : 1077 : }
872 : :
873 : : /*
874 : : * This must be called ONCE at the end of startup/recovery.
875 : : */
876 : : void
877 : 1010 : TrimCLOG(void)
878 : : {
879 heikki.linnakangas@i 879 : 1010 : TransactionId xid = XidFromFullTransactionId(TransamVariables->nextXid);
888 akorotkov@postgresql 880 : 1010 : int64 pageno = TransactionIdToPage(xid);
797 alvherre@alvh.no-ip. 881 : 1010 : LWLock *lock = SimpleLruGetBankLock(XactCtl, pageno);
882 : :
883 : 1010 : LWLockAcquire(lock, LW_EXCLUSIVE);
884 : :
885 : : /*
886 : : * Zero out the remainder of the current clog page. Under normal
887 : : * circumstances it should be zeroes already, but it seems at least
888 : : * theoretically possible that XLOG replay will have settled on a nextXID
889 : : * value that is less than the last XID actually used and marked by the
890 : : * previous database lifecycle (since subtransaction commit writes clog
891 : : * but makes no WAL entry). Let's just be safe. (We need not worry about
892 : : * pages beyond the current one, since those will be zeroed when first
893 : : * used. For the same reason, there is no need to do anything when
894 : : * nextXid is exactly at a page boundary; and it's likely that the
895 : : * "current" page doesn't exist yet in that case.)
896 : : */
7804 tgl@sss.pgh.pa.us 897 [ + + ]: 1010 : if (TransactionIdToPgIndex(xid) != 0)
898 : : {
899 : 1009 : int byteno = TransactionIdToByte(xid);
900 : 1009 : int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
901 : : int slotno;
902 : : char *byteptr;
903 : :
53 heikki.linnakangas@i 904 :GNC 1009 : slotno = SimpleLruReadPage(XactCtl, pageno, false, &xid);
2181 tgl@sss.pgh.pa.us 905 :CBC 1009 : byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
906 : :
907 : : /* Zero so-far-unused positions in the current byte */
7804 908 : 1009 : *byteptr &= (1 << bshift) - 1;
909 : : /* Zero the rest of the page */
910 [ + + + - : 1009 : MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ - - + -
- ]
911 : :
2181 912 : 1009 : XactCtl->shared->page_dirty[slotno] = true;
913 : : }
914 : :
797 alvherre@alvh.no-ip. 915 : 1010 : LWLockRelease(lock);
9019 tgl@sss.pgh.pa.us 916 : 1010 : }
917 : :
918 : : /*
919 : : * Perform a checkpoint --- either during shutdown, or on-the-fly
920 : : */
921 : : void
922 : 1944 : CheckPointCLOG(void)
923 : : {
924 : : /*
925 : : * Write dirty CLOG pages to disk. This may result in sync requests
926 : : * queued for later handling by ProcessSyncRequests(), as part of the
927 : : * checkpoint.
928 : : */
929 : : TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
2048 tmunro@postgresql.or 930 : 1944 : SimpleLruWriteAll(XactCtl, true);
931 : : TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
9019 tgl@sss.pgh.pa.us 932 : 1944 : }
933 : :
934 : :
935 : : /*
936 : : * Make sure that CLOG has room for a newly-allocated XID.
937 : : *
938 : : * NB: this is called while holding XidGenLock. We want it to be very fast
939 : : * most of the time; even when it's not so fast, no actual I/O need happen
940 : : * unless we're forced to write out a dirty clog or xlog page to make room
941 : : * in shared memory.
942 : : */
943 : : void
944 : 171470 : ExtendCLOG(TransactionId newestXact)
945 : : {
946 : : int64 pageno;
947 : : LWLock *lock;
948 : :
949 : : /*
950 : : * No work except at first XID of a page. But beware: just after
951 : : * wraparound, the first XID of page zero is FirstNormalTransactionId.
952 : : */
9018 953 [ + - + + ]: 171470 : if (TransactionIdToPgIndex(newestXact) != 0 &&
954 : : !TransactionIdEquals(newestXact, FirstNormalTransactionId))
9019 955 : 171415 : return;
956 : :
957 : 55 : pageno = TransactionIdToPage(newestXact);
797 alvherre@alvh.no-ip. 958 : 55 : lock = SimpleLruGetBankLock(XactCtl, pageno);
959 : :
960 : 55 : LWLockAcquire(lock, LW_EXCLUSIVE);
961 : :
962 : : /* Zero the page and make a WAL entry about it */
302 alvherre@kurilemu.de 963 :GNC 55 : SimpleLruZeroPage(XactCtl, pageno);
964 : 55 : XLogSimpleInsertInt64(RM_CLOG_ID, CLOG_ZEROPAGE, pageno);
965 : :
797 alvherre@alvh.no-ip. 966 :CBC 55 : LWLockRelease(lock);
967 : : }
968 : :
969 : :
970 : : /*
971 : : * Remove all CLOG segments before the one holding the passed transaction ID
972 : : *
973 : : * Before removing any CLOG data, we must flush XLOG to disk, to ensure that
974 : : * any recently-emitted records with freeze plans have reached disk; otherwise
975 : : * a crash and restart might leave us with some unfrozen tuples referencing
976 : : * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too.
977 : : * Replaying the deletion from XLOG is not critical, since the files could
978 : : * just as well be removed later, but doing so prevents a long-running hot
979 : : * standby server from acquiring an unreasonably bloated CLOG directory.
980 : : *
981 : : * Since CLOG segments hold a large number of transactions, the opportunity to
982 : : * actually remove a segment is fairly rare, and so it seems best not to do
983 : : * the XLOG flush unless we have confirmed that there is a removable segment.
984 : : */
985 : : void
3330 rhaas@postgresql.org 986 : 114 : TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid)
987 : : {
988 : : int64 cutoffPage;
989 : :
990 : : /*
991 : : * The cutoff point is the start of the segment containing oldestXact. We
992 : : * pass the *page* containing oldestXact to SimpleLruTruncate.
993 : : */
9019 tgl@sss.pgh.pa.us 994 : 114 : cutoffPage = TransactionIdToPage(oldestXact);
995 : :
996 : : /* Check to see if there's any files that could be removed */
2181 997 [ + - ]: 114 : if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage))
7925 998 : 114 : return; /* nothing to remove */
999 : :
1000 : : /*
1001 : : * Advance oldestClogXid before truncating clog, so concurrent xact status
1002 : : * lookups can ensure they don't attempt to access truncated-away clog.
1003 : : *
1004 : : * It's only necessary to do this if we will actually truncate away clog
1005 : : * pages.
1006 : : */
3330 rhaas@postgresql.org 1007 :UBC 0 : AdvanceOldestClogXid(oldestXact);
1008 : :
1009 : : /*
1010 : : * Write XLOG record and flush XLOG to disk. We record the oldest xid
1011 : : * we're keeping information about here so we can ensure that it's always
1012 : : * ahead of clog truncation in case we crash, and so a standby finds out
1013 : : * the new valid xid before the next checkpoint.
1014 : : */
1015 : 0 : WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid);
1016 : :
1017 : : /* Now we can remove the old CLOG segment(s) */
2181 tgl@sss.pgh.pa.us 1018 : 0 : SimpleLruTruncate(XactCtl, cutoffPage);
1019 : : }
1020 : :
1021 : :
1022 : : /*
1023 : : * Decide whether a CLOG page number is "older" for truncation purposes.
1024 : : *
1025 : : * We need to use comparison of TransactionIds here in order to do the right
1026 : : * thing with wraparound XID arithmetic. However, TransactionIdPrecedes()
1027 : : * would get weird about permanent xact IDs. So, offset both such that xid1,
1028 : : * xid2, and xid2 + CLOG_XACTS_PER_PAGE - 1 are all normal XIDs; this offset
1029 : : * is relevant to page 0 and to the page preceding page 0.
1030 : : *
1031 : : * The page containing oldestXact-2^31 is the important edge case. The
1032 : : * portion of that page equaling or following oldestXact-2^31 is expendable,
1033 : : * but the portion preceding oldestXact-2^31 is not. When oldestXact-2^31 is
1034 : : * the first XID of a page and segment, the entire page and segment is
1035 : : * expendable, and we could truncate the segment. Recognizing that case would
1036 : : * require making oldestXact, not just the page containing oldestXact,
1037 : : * available to this callback. The benefit would be rare and small, so we
1038 : : * don't optimize that edge case.
1039 : : */
1040 : : static bool
888 akorotkov@postgresql 1041 :CBC 48513 : CLOGPagePrecedes(int64 page1, int64 page2)
1042 : : {
1043 : : TransactionId xid1;
1044 : : TransactionId xid2;
1045 : :
8958 tgl@sss.pgh.pa.us 1046 : 48513 : xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
1935 noah@leadboat.com 1047 : 48513 : xid1 += FirstNormalTransactionId + 1;
8958 tgl@sss.pgh.pa.us 1048 : 48513 : xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
1935 noah@leadboat.com 1049 : 48513 : xid2 += FirstNormalTransactionId + 1;
1050 : :
1051 [ + + + + ]: 80779 : return (TransactionIdPrecedes(xid1, xid2) &&
1052 : 32266 : TransactionIdPrecedes(xid1, xid2 + CLOG_XACTS_PER_PAGE - 1));
1053 : : }
1054 : :
1055 : : static int
53 heikki.linnakangas@i 1056 :UNC 0 : clog_errdetail_for_io_error(const void *opaque_data)
1057 : : {
1058 : 0 : TransactionId xid = *(const TransactionId *) opaque_data;
1059 : :
1060 : 0 : return errdetail("Could not access commit status of transaction %u.", xid);
53 heikki.linnakangas@i 1061 :ECB (48) : }
1062 : :
1063 : :
1064 : : /*
1065 : : * Write a TRUNCATE xlog record
1066 : : *
1067 : : * We must flush the xlog record to disk before returning --- see notes
1068 : : * in TruncateCLOG().
1069 : : */
1070 : : static void
888 akorotkov@postgresql 1071 :UBC 0 : WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact, Oid oldestXactDb)
1072 : : {
1073 : : XLogRecPtr recptr;
1074 : : xl_clog_truncate xlrec;
1075 : :
3330 rhaas@postgresql.org 1076 : 0 : xlrec.pageno = pageno;
1077 : 0 : xlrec.oldestXact = oldestXact;
1078 : 0 : xlrec.oldestXactDb = oldestXactDb;
1079 : :
4184 heikki.linnakangas@i 1080 : 0 : XLogBeginInsert();
448 peter@eisentraut.org 1081 : 0 : XLogRegisterData(&xlrec, sizeof(xl_clog_truncate));
4184 heikki.linnakangas@i 1082 : 0 : recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE);
7121 tgl@sss.pgh.pa.us 1083 : 0 : XLogFlush(recptr);
1084 : 0 : }
1085 : :
1086 : : /*
1087 : : * CLOG resource manager's routines
1088 : : */
1089 : : void
4184 heikki.linnakangas@i 1090 : 0 : clog_redo(XLogReaderState *record)
1091 : : {
1092 : 0 : uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
1093 : :
1094 : : /* Backup blocks are not used in clog records */
1095 [ # # ]: 0 : Assert(!XLogRecHasAnyBlockRefs(record));
1096 : :
7925 tgl@sss.pgh.pa.us 1097 [ # # ]: 0 : if (info == CLOG_ZEROPAGE)
1098 : : {
1099 : : int64 pageno;
1100 : :
888 akorotkov@postgresql 1101 : 0 : memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
302 alvherre@kurilemu.de 1102 :UNC 0 : SimpleLruZeroAndWritePage(XactCtl, pageno);
1103 : : }
7121 tgl@sss.pgh.pa.us 1104 [ # # ]:UBC 0 : else if (info == CLOG_TRUNCATE)
1105 : : {
1106 : : xl_clog_truncate xlrec;
1107 : :
3330 rhaas@postgresql.org 1108 : 0 : memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_clog_truncate));
1109 : :
1110 : 0 : AdvanceOldestClogXid(xlrec.oldestXact);
1111 : :
2181 tgl@sss.pgh.pa.us 1112 : 0 : SimpleLruTruncate(XactCtl, xlrec.pageno);
1113 : : }
1114 : : else
7121 1115 [ # # ]: 0 : elog(PANIC, "clog_redo: unknown op code %u", info);
7925 1116 : 0 : }
1117 : :
1118 : : /*
1119 : : * Entrypoint for sync.c to sync clog files.
1120 : : */
1121 : : int
2048 tmunro@postgresql.or 1122 : 0 : clogsyncfiletag(const FileTag *ftag, char *path)
1123 : : {
1124 : 0 : return SlruSyncFileTag(XactCtl, ftag, path);
1125 : : }
|