Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * sync.c
4 : : * File synchronization management code.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/sync/sync.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : #include "postgres.h"
16 : :
17 : : #include <unistd.h>
18 : : #include <fcntl.h>
19 : : #include <sys/file.h>
20 : :
21 : : #include "access/clog.h"
22 : : #include "access/commit_ts.h"
23 : : #include "access/multixact.h"
24 : : #include "access/xlog.h"
25 : : #include "miscadmin.h"
26 : : #include "pgstat.h"
27 : : #include "portability/instr_time.h"
28 : : #include "postmaster/bgwriter.h"
29 : : #include "storage/fd.h"
30 : : #include "storage/latch.h"
31 : : #include "storage/md.h"
32 : : #include "utils/hsearch.h"
33 : : #include "utils/memutils.h"
34 : : #include "utils/wait_event.h"
35 : :
36 : : /*
37 : : * In some contexts (currently, standalone backends and the checkpointer)
38 : : * we keep track of pending fsync operations: we need to remember all relation
39 : : * segments that have been written since the last checkpoint, so that we can
40 : : * fsync them down to disk before completing the next checkpoint. This hash
41 : : * table remembers the pending operations. We use a hash table mostly as
42 : : * a convenient way of merging duplicate requests.
43 : : *
44 : : * We use a similar mechanism to remember no-longer-needed files that can
45 : : * be deleted after the next checkpoint, but we use a linked list instead of
46 : : * a hash table, because we don't expect there to be any duplicate requests.
47 : : *
48 : : * These mechanisms are only used for non-temp relations; we never fsync
49 : : * temp rels, nor do we need to postpone their deletion (see comments in
50 : : * mdunlink).
51 : : *
52 : : * (Regular backends do not track pending operations locally, but forward
53 : : * them to the checkpointer.)
54 : : */
55 : : typedef uint16 CycleCtr; /* can be any convenient integer size */
56 : :
57 : : typedef struct
58 : : {
59 : : FileTag tag; /* identifies handler and file */
60 : : CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
61 : : bool canceled; /* canceled is true if we canceled "recently" */
62 : : } PendingFsyncEntry;
63 : :
64 : : typedef struct
65 : : {
66 : : FileTag tag; /* identifies handler and file */
67 : : CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
68 : : bool canceled; /* true if request has been canceled */
69 : : } PendingUnlinkEntry;
70 : :
71 : : static HTAB *pendingOps = NULL;
72 : : static List *pendingUnlinks = NIL;
73 : : static MemoryContext pendingOpsCxt; /* context for the above */
74 : :
75 : : static CycleCtr sync_cycle_ctr = 0;
76 : : static CycleCtr checkpoint_cycle_ctr = 0;
77 : :
78 : : /* Intervals for calling AbsorbSyncRequests */
79 : : #define FSYNCS_PER_ABSORB 10
80 : : #define UNLINKS_PER_ABSORB 10
81 : :
82 : : /*
83 : : * Function pointers for handling sync and unlink requests.
84 : : */
85 : : typedef struct SyncOps
86 : : {
87 : : int (*sync_syncfiletag) (const FileTag *ftag, char *path);
88 : : int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
89 : : bool (*sync_filetagmatches) (const FileTag *ftag,
90 : : const FileTag *candidate);
91 : : } SyncOps;
92 : :
93 : : /*
94 : : * These indexes must correspond to the values of the SyncRequestHandler enum.
95 : : */
96 : : static const SyncOps syncsw[] = {
97 : : /* magnetic disk */
98 : : [SYNC_HANDLER_MD] = {
99 : : .sync_syncfiletag = mdsyncfiletag,
100 : : .sync_unlinkfiletag = mdunlinkfiletag,
101 : : .sync_filetagmatches = mdfiletagmatches
102 : : },
103 : : /* pg_xact */
104 : : [SYNC_HANDLER_CLOG] = {
105 : : .sync_syncfiletag = clogsyncfiletag
106 : : },
107 : : /* pg_commit_ts */
108 : : [SYNC_HANDLER_COMMIT_TS] = {
109 : : .sync_syncfiletag = committssyncfiletag
110 : : },
111 : : /* pg_multixact/offsets */
112 : : [SYNC_HANDLER_MULTIXACT_OFFSET] = {
113 : : .sync_syncfiletag = multixactoffsetssyncfiletag
114 : : },
115 : : /* pg_multixact/members */
116 : : [SYNC_HANDLER_MULTIXACT_MEMBER] = {
117 : : .sync_syncfiletag = multixactmemberssyncfiletag
118 : : }
119 : : };
120 : :
121 : : /*
122 : : * Initialize data structures for the file sync tracking.
123 : : */
124 : : void
2537 tmunro@postgresql.or 125 :CBC 21566 : InitSync(void)
126 : : {
127 : : /*
128 : : * Create pending-operations hashtable if we need it. Currently, we need
129 : : * it if we are standalone (not under a postmaster) or if we are a
130 : : * checkpointer auxiliary process.
131 : : */
1686 132 [ + + + + ]: 21566 : if (!IsUnderPostmaster || AmCheckpointerProcess())
133 : : {
134 : : HASHCTL hash_ctl;
135 : :
136 : : /*
137 : : * XXX: The checkpointer needs to add entries to the pending ops table
138 : : * when absorbing fsync requests. That is done within a critical
139 : : * section, which isn't usually allowed, but we make an exception. It
140 : : * means that there's a theoretical possibility that you run out of
141 : : * memory while absorbing fsync requests, which leads to a PANIC.
142 : : * Fortunately the hash table is small so that's unlikely to happen in
143 : : * practice.
144 : : */
2537 145 : 700 : pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
146 : : "Pending ops context",
147 : : ALLOCSET_DEFAULT_SIZES);
148 : 700 : MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
149 : :
150 : 700 : hash_ctl.keysize = sizeof(FileTag);
151 : 700 : hash_ctl.entrysize = sizeof(PendingFsyncEntry);
152 : 700 : hash_ctl.hcxt = pendingOpsCxt;
153 : 700 : pendingOps = hash_create("Pending Ops Table",
154 : : 100L,
155 : : &hash_ctl,
156 : : HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
157 : 700 : pendingUnlinks = NIL;
158 : : }
159 : 21566 : }
160 : :
161 : : /*
162 : : * SyncPreCheckpoint() -- Do pre-checkpoint work
163 : : *
164 : : * To distinguish unlink requests that arrived before this checkpoint
165 : : * started from those that arrived during the checkpoint, we use a cycle
166 : : * counter similar to the one we use for fsync requests. That cycle
167 : : * counter is incremented here.
168 : : *
169 : : * This must be called *before* the checkpoint REDO point is determined.
170 : : * That ensures that we won't delete files too soon. Since this calls
171 : : * AbsorbSyncRequests(), which performs memory allocations, it cannot be
172 : : * called within a critical section.
173 : : *
174 : : * Note that we can't do anything here that depends on the assumption
175 : : * that the checkpoint will be completed.
176 : : */
177 : : void
178 : 1600 : SyncPreCheckpoint(void)
179 : : {
180 : : /*
181 : : * Operations such as DROP TABLESPACE assume that the next checkpoint will
182 : : * process all recently forwarded unlink requests, but if they aren't
183 : : * absorbed prior to advancing the cycle counter, they won't be processed
184 : : * until a future checkpoint. The following absorb ensures that any
185 : : * unlink requests forwarded before the checkpoint began will be processed
186 : : * in the current checkpoint.
187 : : */
1460 188 : 1600 : AbsorbSyncRequests();
189 : :
190 : : /*
191 : : * Any unlink requests arriving after this point will be assigned the next
192 : : * cycle counter, and won't be unlinked until next checkpoint.
193 : : */
2537 194 : 1600 : checkpoint_cycle_ctr++;
195 : 1600 : }
196 : :
197 : : /*
198 : : * SyncPostCheckpoint() -- Do post-checkpoint work
199 : : *
200 : : * Remove any lingering files that can now be safely removed.
201 : : */
202 : : void
203 : 1598 : SyncPostCheckpoint(void)
204 : : {
205 : : int absorb_counter;
206 : : ListCell *lc;
207 : :
208 : 1598 : absorb_counter = UNLINKS_PER_ABSORB;
1594 tgl@sss.pgh.pa.us 209 [ + + + + : 36715 : foreach(lc, pendingUnlinks)
+ + ]
210 : : {
211 : 35255 : PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
212 : : char path[MAXPGPATH];
213 : :
214 : : /* Skip over any canceled entries */
215 [ + + ]: 35255 : if (entry->canceled)
216 : 1 : continue;
217 : :
218 : : /*
219 : : * New entries are appended to the end, so if the entry is new we've
220 : : * reached the end of old entries.
221 : : *
222 : : * Note: if just the right number of consecutive checkpoints fail, we
223 : : * could be fooled here by cycle_ctr wraparound. However, the only
224 : : * consequence is that we'd delay unlinking for one more checkpoint,
225 : : * which is perfectly tolerable.
226 : : */
2537 tmunro@postgresql.or 227 [ + + ]: 35254 : if (entry->cycle_ctr == checkpoint_cycle_ctr)
228 : 138 : break;
229 : :
230 : : /* Unlink the file */
231 [ + + ]: 35116 : if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
232 : : path) < 0)
233 : : {
234 : : /*
235 : : * There's a race condition, when the database is dropped at the
236 : : * same time that we process the pending unlink requests. If the
237 : : * DROP DATABASE deletes the file before we do, we will get ENOENT
238 : : * here. rmtree() also has to ignore ENOENT errors, to deal with
239 : : * the possibility that we delete the file first.
240 : : */
241 [ - + ]: 3 : if (errno != ENOENT)
2537 tmunro@postgresql.or 242 [ # # ]:UBC 0 : ereport(WARNING,
243 : : (errcode_for_file_access(),
244 : : errmsg("could not remove file \"%s\": %m", path)));
245 : : }
246 : :
247 : : /* Mark the list entry as canceled, just in case */
1594 tgl@sss.pgh.pa.us 248 :CBC 35116 : entry->canceled = true;
249 : :
250 : : /*
251 : : * As in ProcessSyncRequests, we don't want to stop absorbing fsync
252 : : * requests for a long time when there are many deletions to be done.
253 : : * We can safely call AbsorbSyncRequests() at this point in the loop.
254 : : */
2537 tmunro@postgresql.or 255 [ + + ]: 35116 : if (--absorb_counter <= 0)
256 : : {
257 : 3384 : AbsorbSyncRequests();
258 : 3384 : absorb_counter = UNLINKS_PER_ABSORB;
259 : : }
260 : : }
261 : :
262 : : /*
263 : : * If we reached the end of the list, we can just remove the whole list
264 : : * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
265 : : * we must keep the entries at or after "lc".
266 : : */
1594 tgl@sss.pgh.pa.us 267 [ + + ]: 1598 : if (lc == NULL)
268 : : {
269 : 1460 : list_free_deep(pendingUnlinks);
270 : 1460 : pendingUnlinks = NIL;
271 : : }
272 : : else
273 : : {
274 : 138 : int ntodelete = list_cell_number(pendingUnlinks, lc);
275 : :
276 [ + + ]: 18907 : for (int i = 0; i < ntodelete; i++)
277 : 18769 : pfree(list_nth(pendingUnlinks, i));
278 : :
279 : 138 : pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
280 : : }
2537 tmunro@postgresql.or 281 : 1598 : }
282 : :
283 : : /*
284 : : * ProcessSyncRequests() -- Process queued fsync requests.
285 : : */
286 : : void
287 : 1802 : ProcessSyncRequests(void)
288 : : {
289 : : static bool sync_in_progress = false;
290 : :
291 : : HASH_SEQ_STATUS hstat;
292 : : PendingFsyncEntry *entry;
293 : : int absorb_counter;
294 : :
295 : : /* Statistics on sync times */
296 : 1802 : int processed = 0;
297 : : instr_time sync_start,
298 : : sync_end,
299 : : sync_diff;
300 : : uint64 elapsed;
301 : 1802 : uint64 longest = 0;
302 : 1802 : uint64 total_elapsed = 0;
303 : :
304 : : /*
305 : : * This is only called during checkpoints, and checkpoints should only
306 : : * occur in processes that have created a pendingOps.
307 : : */
308 [ - + ]: 1802 : if (!pendingOps)
2537 tmunro@postgresql.or 309 [ # # ]:UBC 0 : elog(ERROR, "cannot sync without a pendingOps table");
310 : :
311 : : /*
312 : : * If we are in the checkpointer, the sync had better include all fsync
313 : : * requests that were queued by backends up to this point. The tightest
314 : : * race condition that could occur is that a buffer that must be written
315 : : * and fsync'd for the checkpoint could have been dumped by a backend just
316 : : * before it was visited by BufferSync(). We know the backend will have
317 : : * queued an fsync request before clearing the buffer's dirtybit, so we
318 : : * are safe as long as we do an Absorb after completing BufferSync().
319 : : */
2537 tmunro@postgresql.or 320 :CBC 1802 : AbsorbSyncRequests();
321 : :
322 : : /*
323 : : * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
324 : : * checkpoint), we want to ignore fsync requests that are entered into the
325 : : * hashtable after this point --- they should be processed next time,
326 : : * instead. We use sync_cycle_ctr to tell old entries apart from new
327 : : * ones: new ones will have cycle_ctr equal to the incremented value of
328 : : * sync_cycle_ctr.
329 : : *
330 : : * In normal circumstances, all entries present in the table at this point
331 : : * will have cycle_ctr exactly equal to the current (about to be old)
332 : : * value of sync_cycle_ctr. However, if we fail partway through the
333 : : * fsync'ing loop, then older values of cycle_ctr might remain when we
334 : : * come back here to try again. Repeated checkpoint failures would
335 : : * eventually wrap the counter around to the point where an old entry
336 : : * might appear new, causing us to skip it, possibly allowing a checkpoint
337 : : * to succeed that should not have. To forestall wraparound, any time the
338 : : * previous ProcessSyncRequests() failed to complete, run through the
339 : : * table and forcibly set cycle_ctr = sync_cycle_ctr.
340 : : *
341 : : * Do not merge this loop with the main loop: the problem is exactly
342 : : * that the main loop may fail before having visited all the entries.
343 : : * From a performance point of view it doesn't matter anyway, as this path
344 : : * will never be taken in a system that's functioning normally.
345 : : */
346 [ - + ]: 1802 : if (sync_in_progress)
347 : : {
348 : : /* prior try failed, so update any stale cycle_ctr values */
2537 tmunro@postgresql.or 349 :UBC 0 : hash_seq_init(&hstat, pendingOps);
350 [ # # ]: 0 : while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
351 : : {
352 : 0 : entry->cycle_ctr = sync_cycle_ctr;
353 : : }
354 : : }
355 : :
356 : : /* Advance counter so that new hashtable entries are distinguishable */
2537 tmunro@postgresql.or 357 :CBC 1802 : sync_cycle_ctr++;
358 : :
359 : : /* Set flag to detect failure if we don't reach the end of the loop */
360 : 1802 : sync_in_progress = true;
361 : :
362 : : /* Now scan the hashtable for fsync requests to process */
363 : 1802 : absorb_counter = FSYNCS_PER_ABSORB;
364 : 1802 : hash_seq_init(&hstat, pendingOps);
365 [ + + ]: 176159 : while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
366 : : {
367 : : int failures;
368 : :
369 : : /*
370 : : * If the entry is new then don't process it now; leave it for next time.
371 : : * Note "continue" bypasses the hash-remove call at the bottom of the
372 : : * loop.
373 : : */
374 [ - + ]: 174357 : if (entry->cycle_ctr == sync_cycle_ctr)
2537 tmunro@postgresql.or 375 :UBC 0 : continue;
376 : :
377 : : /* Else assert we haven't missed it */
2537 tmunro@postgresql.or 378 [ - + ]:CBC 174357 : Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
379 : :
380 : : /*
381 : : * If fsync is off then we don't have to bother opening the file at
382 : : * all. (We delay checking until this point so that changing fsync on
383 : : * the fly behaves sensibly.)
384 : : */
2131 heikki.linnakangas@i 385 [ - + ]: 174357 : if (enableFsync)
386 : : {
387 : : /*
388 : : * If in checkpointer, we want to absorb pending requests every so
389 : : * often to prevent overflow of the fsync request queue. It is
390 : : * unspecified whether newly-added entries will be visited by
391 : : * hash_seq_search, but we don't care since we don't need to
392 : : * process them anyway.
393 : : */
2131 heikki.linnakangas@i 394 [ # # ]:UBC 0 : if (--absorb_counter <= 0)
395 : : {
396 : 0 : AbsorbSyncRequests();
397 : 0 : absorb_counter = FSYNCS_PER_ABSORB;
398 : : }
399 : :
400 : : /*
401 : : * The fsync table could contain requests to fsync segments that
402 : : * have been deleted (unlinked) by the time we get to them. Rather
403 : : * than just hoping an ENOENT (or EACCES on Windows) error can be
404 : : * ignored, what we do on error is absorb pending requests and
405 : : * then retry. Since mdunlink() queues a "cancel" message before
406 : : * actually unlinking, the fsync request is guaranteed to be
407 : : * marked canceled after the absorb if it really was this case.
408 : : * DROP DATABASE likewise has to tell us to forget fsync requests
409 : : * before it starts deletions.
410 : : */
411 [ # # ]: 0 : for (failures = 0; !entry->canceled; failures++)
412 : : {
413 : : char path[MAXPGPATH];
414 : :
415 : 0 : INSTR_TIME_SET_CURRENT(sync_start);
416 [ # # ]: 0 : if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
417 : : path) == 0)
418 : : {
419 : : /* Success; update statistics about sync timing */
420 : 0 : INSTR_TIME_SET_CURRENT(sync_end);
421 : 0 : sync_diff = sync_end;
422 : 0 : INSTR_TIME_SUBTRACT(sync_diff, sync_start);
423 : 0 : elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
424 [ # # ]: 0 : if (elapsed > longest)
425 : 0 : longest = elapsed;
426 : 0 : total_elapsed += elapsed;
427 : 0 : processed++;
428 : :
429 [ # # ]: 0 : if (log_checkpoints)
2113 peter@eisentraut.org 430 [ # # ]: 0 : elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
431 : : processed,
432 : : path,
433 : : (double) elapsed / 1000);
434 : :
2131 heikki.linnakangas@i 435 : 0 : break; /* out of retry loop */
436 : : }
437 : :
438 : : /*
439 : : * It is possible that the relation has been dropped or
440 : : * truncated since the fsync request was entered. Therefore,
441 : : * allow ENOENT, but only if we didn't fail already on this
442 : : * file.
443 : : */
444 [ # # # # ]: 0 : if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
445 [ # # ]: 0 : ereport(data_sync_elevel(ERROR),
446 : : (errcode_for_file_access(),
447 : : errmsg("could not fsync file \"%s\": %m",
448 : : path)));
449 : : else
450 [ # # ]: 0 : ereport(DEBUG1,
451 : : (errcode_for_file_access(),
452 : : errmsg_internal("could not fsync file \"%s\" but retrying: %m",
453 : : path)));
454 : :
455 : : /*
456 : : * Absorb incoming requests and check to see if a cancel
457 : : * arrived for this file.
458 : : */
459 : 0 : AbsorbSyncRequests();
460 : 0 : absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
461 : : } /* end retry loop */
462 : : }
463 : :
464 : : /* We are done with this entry, remove it */
2537 tmunro@postgresql.or 465 [ - + ]:CBC 174357 : if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
2537 tmunro@postgresql.or 466 [ # # ]:UBC 0 : elog(ERROR, "pendingOps corrupted");
467 : : } /* end loop over hashtable entries */
468 : :
469 : : /* Return sync performance metrics for report at checkpoint end */
2537 tmunro@postgresql.or 470 :CBC 1802 : CheckpointStats.ckpt_sync_rels = processed;
471 : 1802 : CheckpointStats.ckpt_longest_sync = longest;
472 : 1802 : CheckpointStats.ckpt_agg_sync_time = total_elapsed;
473 : :
474 : : /* Flag successful completion of ProcessSyncRequests */
475 : 1802 : sync_in_progress = false;
476 : 1802 : }
477 : :
478 : : /*
479 : : * RememberSyncRequest() -- callback from checkpointer side of sync request
480 : : *
481 : : * We stuff fsync requests into the local hash table for execution
482 : : * during the checkpointer's next checkpoint. UNLINK requests go into a
483 : : * separate linked list, however, because they get processed separately.
484 : : *
485 : : * See sync.h for more information on the types of sync requests supported.
486 : : */
487 : : void
488 : 1096235 : RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
489 : : {
490 [ - + ]: 1096235 : Assert(pendingOps);
491 : :
492 [ + + ]: 1096235 : if (type == SYNC_FORGET_REQUEST)
493 : : {
494 : : PendingFsyncEntry *entry;
495 : :
496 : : /* Cancel previously entered request */
497 : 138031 : entry = (PendingFsyncEntry *) hash_search(pendingOps,
498 : : ftag,
499 : : HASH_FIND,
500 : : NULL);
501 [ + + ]: 138031 : if (entry != NULL)
502 : 8120 : entry->canceled = true;
503 : : }
504 [ + + ]: 958204 : else if (type == SYNC_FILTER_REQUEST)
505 : : {
506 : : HASH_SEQ_STATUS hstat;
507 : : PendingFsyncEntry *pfe;
508 : : ListCell *cell;
509 : :
510 : : /* Cancel matching fsync requests */
511 : 42 : hash_seq_init(&hstat, pendingOps);
1342 512 [ + + ]: 7416 : while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
513 : : {
514 [ + + + + ]: 14659 : if (pfe->tag.handler == ftag->handler &&
515 : 7327 : syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
516 : 3952 : pfe->canceled = true;
517 : : }
518 : :
519 : : /* Cancel matching unlink requests */
2435 tgl@sss.pgh.pa.us 520 [ + + + + : 97 : foreach(cell, pendingUnlinks)
+ + ]
521 : : {
1342 tmunro@postgresql.or 522 : 55 : PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);
523 : :
524 [ + - + + ]: 110 : if (pue->tag.handler == ftag->handler &&
525 : 55 : syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
526 : 1 : pue->canceled = true;
527 : : }
528 : : }
2537 529 [ + + ]: 958162 : else if (type == SYNC_UNLINK_REQUEST)
530 : : {
531 : : /* Unlink request: put it in the linked list */
532 : 35117 : MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
533 : : PendingUnlinkEntry *entry;
534 : :
95 michael@paquier.xyz 535 :GNC 35117 : entry = palloc_object(PendingUnlinkEntry);
2537 tmunro@postgresql.or 536 :CBC 35117 : entry->tag = *ftag;
537 : 35117 : entry->cycle_ctr = checkpoint_cycle_ctr;
1594 tgl@sss.pgh.pa.us 538 : 35117 : entry->canceled = false;
539 : :
2537 tmunro@postgresql.or 540 : 35117 : pendingUnlinks = lappend(pendingUnlinks, entry);
541 : :
542 : 35117 : MemoryContextSwitchTo(oldcxt);
543 : : }
544 : : else
545 : : {
546 : : /* Normal case: enter a request to fsync this segment */
547 : 923045 : MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
548 : : PendingFsyncEntry *entry;
549 : : bool found;
550 : :
551 [ - + ]: 923045 : Assert(type == SYNC_REQUEST);
552 : :
553 : 923045 : entry = (PendingFsyncEntry *) hash_search(pendingOps,
554 : : ftag,
555 : : HASH_ENTER,
556 : : &found);
557 : : /* if new entry, or was previously canceled, initialize it */
1997 558 [ + + + + ]: 923045 : if (!found || entry->canceled)
559 : : {
2537 560 : 176441 : entry->cycle_ctr = sync_cycle_ctr;
561 : 176441 : entry->canceled = false;
562 : : }
563 : :
564 : : /*
565 : : * NB: it's intentional that we don't change cycle_ctr if the entry
566 : : * already exists. The cycle_ctr must represent the oldest fsync
567 : : * request that could be in the entry.
568 : : */
569 : :
570 : 923045 : MemoryContextSwitchTo(oldcxt);
571 : : }
572 : 1096235 : }
573 : :
574 : : /*
575 : : * Register the sync request locally, or forward it to the checkpointer.
576 : : *
577 : : * If retryOnError is true, we keep retrying until there is space in the queue.
578 : : * Returns true on success; false only if the queue was full and !retryOnError.
579 : : */
580 : : bool
581 : 1189463 : RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
582 : : bool retryOnError)
583 : : {
584 : : bool ret;
585 : :
586 [ + + ]: 1189463 : if (pendingOps != NULL)
587 : : {
588 : : /* standalone backend or checkpointer: fsync state is local */
589 : 357345 : RememberSyncRequest(ftag, type);
590 : 357345 : return true;
591 : : }
592 : :
593 : : for (;;)
594 : : {
595 : : /*
596 : : * Notify the checkpointer about it. If we fail to queue a message in
597 : : * retryOnError mode, we have to sleep and try again ... ugly, but
598 : : * hopefully won't happen often.
599 : : *
600 : : * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
601 : : * error in the case of SYNC_UNLINK_REQUEST would leave the
602 : : * no-longer-used file still present on disk, which would be bad, so
603 : : * I'm inclined to assume that the checkpointer will always empty the
604 : : * queue soon.
605 : : */
606 : 832131 : ret = ForwardSyncRequest(ftag, type);
607 : :
608 : : /*
609 : : * If we are successful in queueing the request, or we failed and were
610 : : * instructed not to retry on error, break.
611 : : */
612 [ + + + - : 832131 : if (ret || (!ret && !retryOnError))
+ + ]
613 : : break;
614 : :
1460 615 : 13 : WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
616 : : WAIT_EVENT_REGISTER_SYNC_REQUEST);
617 : : }
618 : :
2537 619 : 832118 : return ret;
620 : : }
|