Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : * slotsync.c
3 : : * Functionality for synchronizing slots to a standby server from the
4 : : * primary server.
5 : : *
6 : : * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 : : *
8 : : * IDENTIFICATION
9 : : * src/backend/replication/logical/slotsync.c
10 : : *
11 : : * This file contains the code for slot synchronization on a physical standby
12 : : * to fetch logical failover slots information from the primary server, create
13 : : * the slots on the standby and synchronize them periodically.
14 : : *
15 : : * Slot synchronization can be performed either automatically by enabling slot
16 : : * sync worker or manually by calling SQL function pg_sync_replication_slots().
17 : : *
18 : : * If the WAL corresponding to the remote's restart_lsn is not available on the
19 : : * physical standby or the remote's catalog_xmin precedes the oldest xid for
20 : : * which it is guaranteed that rows wouldn't have been removed then we cannot
21 : : * create the local standby slot because that would mean moving the local slot
22 : : * backward and decoding won't be possible via such a slot. In this case, the
23 : : * slot will be marked as RS_TEMPORARY. Once the primary server catches up,
24 : : * the slot will be marked as RS_PERSISTENT (which means sync-ready) after
25 : : * which slot sync worker can perform the sync periodically or user can call
26 : : * pg_sync_replication_slots() periodically to perform the syncs.
27 : : *
28 : : * If synchronized slots fail to build a consistent snapshot from the
29 : : * restart_lsn before reaching confirmed_flush_lsn, they would become
30 : : * unreliable after promotion due to potential data loss from changes
31 : : * before reaching a consistent point. This can happen because the slots can
32 : : * be synced at some random time and we may not reach the consistent point
33 : : * at the same WAL location as the primary. So, we mark such slots as
34 : : * RS_TEMPORARY. Once the decoding from corresponding LSNs can reach a
35 : : * consistent point, they will be marked as RS_PERSISTENT.
36 : : *
37 : : * If the WAL prior to the remote slot's confirmed_flush_lsn has not been
38 : : * flushed on the standby, the slot is marked as RS_TEMPORARY. Once the standby
39 : : * catches up and flushes that WAL, the slot will be marked as RS_PERSISTENT.
40 : : *
41 : : * The slot sync worker waits for some time before the next synchronization,
42 : : * with the duration varying based on whether any slots were updated during
43 : : * the last cycle. Refer to the comments above wait_for_slot_activity() for
44 : : * more details.
45 : : *
46 : : * If the SQL function pg_sync_replication_slots() is used to sync the slots,
47 : : * and if the slots are not ready to be synced and are marked as RS_TEMPORARY
48 : : * because of any of the reasons mentioned above, then the SQL function also
49 : : * waits and retries until the slots are marked as RS_PERSISTENT (which means
50 : : * sync-ready). Refer to the comments in SyncReplicationSlots() for more
51 : : * details.
52 : : *
53 : : * Any standby synchronized slots will be dropped if they no longer need
54 : : * to be synchronized. See comment atop drop_local_obsolete_slots() for more
55 : : * details.
56 : : *---------------------------------------------------------------------------
57 : : */
58 : :
59 : : #include "postgres.h"
60 : :
61 : : #include <time.h>
62 : :
63 : : #include "access/xlog_internal.h"
64 : : #include "access/xlogrecovery.h"
65 : : #include "catalog/pg_database.h"
66 : : #include "libpq/pqsignal.h"
67 : : #include "pgstat.h"
68 : : #include "postmaster/interrupt.h"
69 : : #include "replication/logical.h"
70 : : #include "replication/slotsync.h"
71 : : #include "replication/snapbuild.h"
72 : : #include "storage/ipc.h"
73 : : #include "storage/lmgr.h"
74 : : #include "storage/proc.h"
75 : : #include "storage/procarray.h"
76 : : #include "storage/subsystems.h"
77 : : #include "tcop/tcopprot.h"
78 : : #include "utils/builtins.h"
79 : : #include "utils/memutils.h"
80 : : #include "utils/pg_lsn.h"
81 : : #include "utils/ps_status.h"
82 : : #include "utils/timeout.h"
83 : : #include "utils/wait_event.h"
84 : :
85 : : /*
86 : : * Struct for sharing information to control slot synchronization.
87 : : *
88 : : * The 'pid' is either the slot sync worker's pid or the backend's pid running
89 : : * the SQL function pg_sync_replication_slots(). On promotion, the startup
90 : : * process sets 'stopSignaled' and uses this 'pid' to signal the synchronizing
91 : : * process with PROCSIG_SLOTSYNC_MESSAGE and also to wake it up so that the
92 : : * process can immediately stop its synchronizing work.
93 : : * Setting 'stopSignaled' on the other hand is used to handle the race
94 : : * condition when the postmaster has not noticed the promotion yet and thus may
95 : : * end up restarting the slot sync worker. If 'stopSignaled' is set, the worker
96 : : * will exit in such a case. The SQL function pg_sync_replication_slots() will
97 : : * also error out if this flag is set. Note that we don't need to reset this
98 : : * variable as after promotion the slot sync worker won't be restarted because
99 : : * the pmState changes to PM_RUN from PM_HOT_STANDBY and we don't support
100 : : * demoting primary without restarting the server.
101 : : * See LaunchMissingBackgroundProcesses.
102 : : *
103 : : * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot
104 : : * overwrites.
105 : : *
106 : : * The 'last_start_time' is needed by postmaster to start the slot sync worker
107 : : * once per SLOTSYNC_RESTART_INTERVAL_SEC. In cases where an immediate restart
108 : : * is expected (e.g., slot sync GUCs change), slot sync worker will reset
109 : : * last_start_time before exiting, so that postmaster can start the worker
110 : : * without waiting for SLOTSYNC_RESTART_INTERVAL_SEC.
111 : : */
112 : : typedef struct SlotSyncCtxStruct
113 : : {
114 : : pid_t pid; /* pid of the slot sync worker, or of the backend running pg_sync_replication_slots() */
115 : : bool stopSignaled; /* set by the startup process on promotion; the syncing process must stop */
116 : : bool syncing; /* needed to prevent concurrent slot syncs */
117 : : time_t last_start_time; /* used by postmaster to start the worker once per restart interval */
118 : : slock_t mutex; /* NOTE(review): presumably protects the fields above -- confirm against the users of SlotSyncCtx */
119 : : } SlotSyncCtxStruct;
120 : :
121 : : static SlotSyncCtxStruct *SlotSyncCtx = NULL; /* slot sync control struct; starts out NULL */
122 : :
123 : : static void SlotSyncShmemRequest(void *arg);
124 : : static void SlotSyncShmemInit(void *arg);
125 : :
126 : : const ShmemCallbacks SlotSyncShmemCallbacks = {
127 : : .request_fn = SlotSyncShmemRequest,
128 : : .init_fn = SlotSyncShmemInit,
129 : : };
130 : :
131 : : /* GUC variable */
132 : : bool sync_replication_slots = false;
133 : :
134 : : /*
135 : : * The sleep time (ms) between slot-sync cycles varies dynamically
136 : : * (within a MIN/MAX range) according to slot activity. See
137 : : * wait_for_slot_activity() for details.
138 : : */
139 : : #define MIN_SLOTSYNC_WORKER_NAPTIME_MS 200 /* 200ms */
140 : : #define MAX_SLOTSYNC_WORKER_NAPTIME_MS 30000 /* 30s */
141 : :
142 : : static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS; /* current nap time; starts at the minimum */
143 : :
144 : : /* The restart interval for slot sync worker used by postmaster */
145 : : #define SLOTSYNC_RESTART_INTERVAL_SEC 10
146 : :
147 : : /*
148 : : * Flag to tell if we are syncing replication slots. Unlike the 'syncing' flag
149 : : * in SlotSyncCtxStruct, this flag is true only if the current process is
150 : : * performing slot synchronization.
151 : : */
152 : : static bool syncing_slots = false;
153 : :
154 : : /*
155 : : * Interrupt flag set when PROCSIG_SLOTSYNC_MESSAGE is received, asking the
156 : : * slotsync worker or pg_sync_replication_slots() to stop because
157 : : * standby promotion has been triggered.
158 : : */
159 : : volatile sig_atomic_t SlotSyncShutdownPending = false;
160 : :
161 : : /*
162 : : * Structure to hold information fetched from the primary server about a logical
163 : : * replication slot.
164 : : */
165 : : typedef struct RemoteSlot
166 : : {
167 : : char *name; /* slot name; matched against the local slot's data.name */
168 : : char *plugin; /* output plugin name; copied into the local slot's data.plugin */
169 : : char *database; /* NOTE(review): presumably the database name; the OID is passed separately as remote_dbid -- confirm */
170 : : bool two_phase;
171 : : bool failover;
172 : : XLogRecPtr restart_lsn;
173 : : XLogRecPtr confirmed_lsn; /* synced into the local slot's data.confirmed_flush */
174 : : XLogRecPtr two_phase_at;
175 : : TransactionId catalog_xmin;
176 : :
177 : : /* RS_INVAL_NONE if valid, or the reason of invalidation */
178 : : ReplicationSlotInvalidationCause invalidated;
179 : : } RemoteSlot;
180 : :
181 : : static void slotsync_failure_callback(int code, Datum arg); /* forward declarations of file-local (static) helpers */
182 : : static void update_synced_slots_inactive_since(void);
183 : :
184 : : /*
185 : : * Update slot sync skip stats for the slot in MyReplicationSlot. This function requires the caller to acquire
186 : : * the slot. A skip is reported via pgstat_report_replslotsync() only when skip_reason is not SS_SKIP_NONE; the slot's slotsync_skip_reason is rewritten only when it differs from skip_reason.
187 : : */
188 : : static void
183 akapila@postgresql.o 189 :GNC 43 : update_slotsync_skip_stats(SlotSyncSkipReason skip_reason)
190 : : {
191 : : ReplicationSlot *slot;
192 : :
193 [ - + ]: 43 : Assert(MyReplicationSlot);
194 : :
195 : 43 : slot = MyReplicationSlot;
196 : :
197 : : /*
198 : : * Update the slot sync related stats in pg_stat_replication_slots when a
199 : : * slot sync is skipped
200 : : */
201 [ + + ]: 43 : if (skip_reason != SS_SKIP_NONE)
202 : 7 : pgstat_report_replslotsync(slot);
203 : :
204 : : /* Update the slot sync skip reason (compared without the spinlock, written under it) */
205 [ + + ]: 43 : if (slot->slotsync_skip_reason != skip_reason)
206 : : {
207 : 4 : SpinLockAcquire(&slot->mutex);
208 : 4 : slot->slotsync_skip_reason = skip_reason;
209 : 4 : SpinLockRelease(&slot->mutex);
210 : : }
211 : 43 : }
212 : :
213 : : /*
214 : : * If necessary, update the local synced slot's metadata based on the data
215 : : * from the remote slot. Operates on MyReplicationSlot, which must not be invalidated (asserted below).
216 : : *
217 : : * If no update was needed (the data of the remote slot is the same as the
218 : : * local slot) return false, otherwise true. False is also returned, after recording a skip reason, when the sync is skipped because the remote confirmed_lsn is ahead of the standby's flushed WAL, or because the local restart_lsn or catalog_xmin is ahead of the remote slot's.
219 : : */
220 : : static bool
107 221 : 43 : update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
222 : : {
836 akapila@postgresql.o 223 :CBC 43 : ReplicationSlot *slot = MyReplicationSlot;
778 224 : 43 : bool updated_xmin_or_lsn = false;
225 : 43 : bool updated_config = false;
183 akapila@postgresql.o 226 :GNC 43 : SlotSyncSkipReason skip_reason = SS_SKIP_NONE;
107 227 : 43 : XLogRecPtr latestFlushPtr = GetStandbyFlushRecPtr(NULL);
228 : :
836 akapila@postgresql.o 229 [ - + ]:CBC 43 : Assert(slot->data.invalidated == RS_INVAL_NONE);
230 : :
231 : : /*
232 : : * Make sure that concerned WAL is received and flushed before syncing
233 : : * slot to target lsn received from the primary server.
234 : : */
107 akapila@postgresql.o 235 [ - + ]:GNC 43 : if (remote_slot->confirmed_lsn > latestFlushPtr)
236 : : {
107 akapila@postgresql.o 237 :UNC 0 : update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED);
238 : :
239 : : /*
240 : : * Can get here only if GUC 'synchronized_standby_slots' on the
241 : : * primary server was not configured correctly.
242 : : */
85 243 [ # # ]: 0 : ereport(LOG,
244 : : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
245 : : errmsg("skipping slot synchronization because the received slot sync"
246 : : " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X",
247 : : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
248 : : remote_slot->name,
249 : : LSN_FORMAT_ARGS(latestFlushPtr)));
250 : :
107 251 : 0 : return false;
252 : : }
253 : :
254 : : /*
255 : : * Don't overwrite if we already have a newer catalog_xmin and
256 : : * restart_lsn.
257 : : */
778 akapila@postgresql.o 258 [ + + + + ]:CBC 81 : if (remote_slot->restart_lsn < slot->data.restart_lsn ||
259 : 38 : TransactionIdPrecedes(remote_slot->catalog_xmin,
260 : : slot->data.catalog_xmin))
261 : : {
262 : : /* Update slot sync skip stats */
183 akapila@postgresql.o 263 :GNC 7 : update_slotsync_skip_stats(SS_SKIP_WAL_OR_ROWS_REMOVED);
264 : :
265 : : /*
266 : : * This can happen in the following situations:
267 : : *
268 : : * If the slot is temporary, it means either the initial WAL location
269 : : * reserved for the local slot is ahead of the remote slot's
270 : : * restart_lsn or the initial xmin_horizon computed for the local slot
271 : : * is ahead of the remote slot.
272 : : *
273 : : * If the slot is persistent, both restart_lsn and catalog_xmin of the
274 : : * synced slot could still be ahead of the remote slot. Since we use
275 : : * slot advance functionality to keep snapbuild/slot updated, it is
276 : : * possible that the restart_lsn and catalog_xmin are advanced to a
277 : : * later position than it has on the primary. This can happen when
278 : : * slot advancing machinery finds running xacts record after reaching
279 : : * the consistent state at a later point than the primary where it
280 : : * serializes the snapshot and updates the restart_lsn.
281 : : *
282 : : * We LOG the message if the slot is temporary as it can help the user
283 : : * to understand why the slot is not sync-ready. In the case of a
284 : : * persistent slot, it would be a more common case and won't directly
285 : : * impact the users, so we used DEBUG1 level to log the message.
286 : : */
778 akapila@postgresql.o 287 [ + - + - ]:CBC 7 : ereport(slot->data.persistency == RS_TEMPORARY ? LOG : DEBUG1,
288 : : errmsg("could not synchronize replication slot \"%s\"",
289 : : remote_slot->name),
290 : : errdetail("Synchronization could lead to data loss, because the remote slot needs WAL at LSN %X/%08X and catalog xmin %u, but the standby has LSN %X/%08X and catalog xmin %u.",
291 : : LSN_FORMAT_ARGS(remote_slot->restart_lsn),
292 : : remote_slot->catalog_xmin,
293 : : LSN_FORMAT_ARGS(slot->data.restart_lsn),
294 : : slot->data.catalog_xmin));
295 : :
296 : : /*
297 : : * Skip updating the configuration. This is required to avoid syncing
298 : : * two_phase_at without syncing confirmed_lsn. Otherwise, the prepared
299 : : * transaction between old confirmed_lsn and two_phase_at will
300 : : * unexpectedly get decoded and sent to the downstream after
301 : : * promotion. See comments in ReorderBufferFinishPrepared.
302 : : */
396 303 : 7 : return false;
304 : : }
305 : :
306 : : /*
307 : : * Attempt to sync LSNs and xmins only if remote slot is ahead of local
308 : : * slot.
309 : : */
310 [ + + ]: 36 : if (remote_slot->confirmed_lsn > slot->data.confirmed_flush ||
311 [ + + - + ]: 49 : remote_slot->restart_lsn > slot->data.restart_lsn ||
312 : 24 : TransactionIdFollows(remote_slot->catalog_xmin,
313 : : slot->data.catalog_xmin))
314 : : {
315 : : /*
316 : : * We can't directly copy the remote slot's LSN or xmin unless there
317 : : * exists a consistent snapshot at that point. Otherwise, after
318 : : * promotion, the slots may not reach a consistent point before the
319 : : * confirmed_flush_lsn which can lead to a data loss. To avoid data
320 : : * loss, we let slot machinery advance the slot which ensures that
321 : : * snapbuilder/slot statuses are updated properly.
322 : : */
787 323 [ + + ]: 12 : if (SnapBuildSnapshotExists(remote_slot->restart_lsn))
324 : : {
325 : : /*
326 : : * Update the slot info directly if there is a serialized snapshot
327 : : * at the restart_lsn, as the slot can quickly reach consistency
328 : : * at restart_lsn by restoring the snapshot.
329 : : */
330 [ - + ]: 1 : SpinLockAcquire(&slot->mutex);
331 : 1 : slot->data.restart_lsn = remote_slot->restart_lsn;
332 : 1 : slot->data.confirmed_flush = remote_slot->confirmed_lsn;
333 : 1 : slot->data.catalog_xmin = remote_slot->catalog_xmin;
334 : 1 : SpinLockRelease(&slot->mutex);
335 : :
47 336 : 1 : updated_xmin_or_lsn = true;
337 : : }
338 : : else
339 : : {
340 : : bool found_consistent_snapshot;
341 : 11 : XLogRecPtr old_confirmed_lsn = slot->data.confirmed_flush;
342 : 11 : XLogRecPtr old_restart_lsn = slot->data.restart_lsn;
343 : 11 : XLogRecPtr old_catalog_xmin = slot->data.catalog_xmin; /* NOTE(review): holds a catalog_xmin (a TransactionId in RemoteSlot) in an XLogRecPtr; only used for the != comparison below, but TransactionId would be the matching type */
344 : :
787 345 : 11 : LogicalSlotAdvanceAndCheckSnapState(remote_slot->confirmed_lsn,
346 : : &found_consistent_snapshot);
347 : :
348 : : /* Sanity check */
778 349 [ - + ]: 11 : if (slot->data.confirmed_flush != remote_slot->confirmed_lsn)
778 akapila@postgresql.o 350 [ # # ]:UBC 0 : ereport(ERROR,
351 : : errmsg_internal("synchronized confirmed_flush for slot \"%s\" differs from remote slot",
352 : : remote_slot->name),
353 : : errdetail_internal("Remote slot has LSN %X/%08X but local slot has LSN %X/%08X.",
354 : : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn),
355 : : LSN_FORMAT_ARGS(slot->data.confirmed_flush)));
356 : :
357 : : /*
358 : : * If we can't reach a consistent snapshot, the slot won't be
359 : : * persisted. See update_and_persist_local_synced_slot().
360 : : */
107 akapila@postgresql.o 361 [ - + ]:GNC 11 : if (!found_consistent_snapshot)
362 : : {
107 akapila@postgresql.o 363 [ # # ]:UNC 0 : Assert(MyReplicationSlot->data.persistency == RS_TEMPORARY);
364 : :
365 [ # # ]: 0 : ereport(LOG,
366 : : errmsg("could not synchronize replication slot \"%s\"",
367 : : remote_slot->name),
368 : : errdetail("Synchronization could lead to data loss, because the standby could not build a consistent snapshot to decode WALs at LSN %X/%08X.",
369 : : LSN_FORMAT_ARGS(slot->data.restart_lsn)));
370 : :
183 371 : 0 : skip_reason = SS_SKIP_NO_CONSISTENT_SNAPSHOT;
372 : : }
373 : :
374 : : /*
375 : : * It is possible that the slot's xmin or LSNs are not updated,
376 : : * when the synced slot has reached consistent snapshot state or
377 : : * cannot build one at all.
378 : : */
47 akapila@postgresql.o 379 :CBC 11 : updated_xmin_or_lsn = (old_confirmed_lsn != slot->data.confirmed_flush ||
380 [ - + - - ]: 11 : old_restart_lsn != slot->data.restart_lsn ||
47 akapila@postgresql.o 381 [ # # ]:UBC 0 : old_catalog_xmin != slot->data.catalog_xmin);
382 : : }
383 : : }
384 : :
385 : : /* Update slot sync skip stats */
183 akapila@postgresql.o 386 :GNC 36 : update_slotsync_skip_stats(skip_reason);
387 : :
787 akapila@postgresql.o 388 [ + - ]:CBC 36 : if (remote_dbid != slot->data.database ||
389 [ + + ]: 36 : remote_slot->two_phase != slot->data.two_phase ||
390 [ + - ]: 35 : remote_slot->failover != slot->data.failover ||
422 391 [ + - ]: 35 : strcmp(remote_slot->plugin, NameStr(slot->data.plugin)) != 0 ||
392 [ - + ]: 35 : remote_slot->two_phase_at != slot->data.two_phase_at)
393 : : {
394 : : NameData plugin_name;
395 : :
396 : : /* Avoid expensive operations while holding a spinlock. */
787 397 : 1 : namestrcpy(&plugin_name, remote_slot->plugin);
398 : :
399 [ - + ]: 1 : SpinLockAcquire(&slot->mutex);
400 : 1 : slot->data.plugin = plugin_name;
401 : 1 : slot->data.database = remote_dbid;
402 : 1 : slot->data.two_phase = remote_slot->two_phase;
422 403 : 1 : slot->data.two_phase_at = remote_slot->two_phase_at;
787 404 : 1 : slot->data.failover = remote_slot->failover;
405 : 1 : SpinLockRelease(&slot->mutex);
406 : :
778 407 : 1 : updated_config = true;
408 : :
409 : : /*
410 : : * Ensure that there is no risk of sending prepared transactions
411 : : * unexpectedly after the promotion.
412 : : */
396 413 [ - + ]: 1 : Assert(slot->data.two_phase_at <= slot->data.confirmed_flush);
414 : : }
415 : :
416 : : /*
417 : : * We have to write the changed xmin to disk *before* we change the
418 : : * in-memory value, otherwise after a crash we wouldn't know that some
419 : : * catalog tuples might have been removed already.
420 : : */
778 421 [ + + + + ]: 36 : if (updated_config || updated_xmin_or_lsn)
422 : : {
423 : 13 : ReplicationSlotMarkDirty();
424 : 13 : ReplicationSlotSave();
425 : : }
426 : :
427 : : /*
428 : : * Now the new xmin is safely on disk, we can let the global value
429 : : * advance. We do not take ProcArrayLock or similar since we only advance
430 : : * xmin here and there's not much harm done by a concurrent computation
431 : : * missing that.
432 : : */
433 [ + + ]: 36 : if (updated_xmin_or_lsn)
434 : : {
435 [ - + ]: 12 : SpinLockAcquire(&slot->mutex);
436 : 12 : slot->effective_catalog_xmin = remote_slot->catalog_xmin;
437 : 12 : SpinLockRelease(&slot->mutex);
438 : :
439 : 12 : ReplicationSlotsComputeRequiredXmin(false);
440 : 12 : ReplicationSlotsComputeRequiredLSN();
441 : : }
442 : :
443 [ + + + + ]: 36 : return updated_config || updated_xmin_or_lsn;
444 : : }
445 : :
446 : : /*
447 : : * Get the list of local logical slots that are synchronized from the
448 : : * primary server. The returned list holds pointers into ReplicationSlotCtl->replication_slots, collected while holding ReplicationSlotControlLock in shared mode; the lock is released before returning.
449 : : */
450 : : static List *
836 451 : 27 : get_local_synced_slots(void)
452 : : {
453 : 27 : List *local_slots = NIL;
454 : :
455 : 27 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
456 : :
53 alvherre@kurilemu.de 457 [ + + ]:GNC 432 : for (int i = 0; i < max_replication_slots + max_repack_replication_slots; i++)
458 : : {
836 akapila@postgresql.o 459 :CBC 405 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
460 : :
461 : : /* Check if it is a synchronized slot */
462 [ + + + + ]: 405 : if (s->in_use && s->data.synced)
463 : : {
464 [ - + ]: 37 : Assert(SlotIsLogical(s));
465 : 37 : local_slots = lappend(local_slots, s);
466 : : }
467 : : }
468 : :
469 : 27 : LWLockRelease(ReplicationSlotControlLock);
470 : :
471 : 27 : return local_slots;
472 : : }
473 : :
474 : : /*
475 : : * Helper function to check if local_slot is required to be retained.
476 : : *
477 : : * Return false either if local_slot does not exist in the remote_slots list
478 : : * or is invalidated while the corresponding remote slot is still valid,
479 : : * otherwise true. Slots are matched by name; the search stops at the first match.
480 : : */
481 : : static bool
482 : 37 : local_sync_slot_required(ReplicationSlot *local_slot, List *remote_slots)
483 : : {
484 : 37 : bool remote_exists = false;
485 : 37 : bool locally_invalidated = false;
486 : :
487 [ + - + + : 89 : foreach_ptr(RemoteSlot, remote_slot, remote_slots)
+ + ]
488 : : {
489 [ + + ]: 51 : if (strcmp(remote_slot->name, NameStr(local_slot->data.name)) == 0)
490 : : {
491 : 36 : remote_exists = true;
492 : :
493 : : /*
494 : : * If remote slot is not invalidated but local slot is marked as
495 : : * invalidated, then set locally_invalidated flag. The local slot's field is read under its spinlock.
496 : : */
497 [ - + ]: 36 : SpinLockAcquire(&local_slot->mutex);
498 : 36 : locally_invalidated =
499 [ + - ]: 72 : (remote_slot->invalidated == RS_INVAL_NONE) &&
500 [ + + ]: 36 : (local_slot->data.invalidated != RS_INVAL_NONE);
501 : 36 : SpinLockRelease(&local_slot->mutex);
502 : :
503 : 36 : break;
504 : : }
505 : : }
506 : :
507 [ + + + + ]: 37 : return (remote_exists && !locally_invalidated);
508 : : }
509 : :
510 : : /*
511 : : * Drop local obsolete slots. 'remote_slot_list' is a list of RemoteSlot pointers that each local synced slot is checked against.
512 : : *
513 : : * Drop the local slots that no longer need to be synced i.e. these either do
514 : : * not exist on the primary or are no longer enabled for failover.
515 : : *
516 : : * Additionally, drop any slots that are valid on the primary but got
517 : : * invalidated on the standby. This situation may occur due to the following
518 : : * reasons:
519 : : * - The 'max_slot_wal_keep_size' on the standby is insufficient to retain WAL
520 : : * records from the restart_lsn of the slot.
521 : : * - 'primary_slot_name' is temporarily reset to null and the physical slot is
522 : : * removed.
523 : : * These dropped slots will get recreated in next sync-cycle and it is okay to
524 : : * drop and recreate such slots as long as these are not consumable on the
525 : : * standby (which is the case currently).
526 : : *
527 : : * Note: Change of 'wal_level' on the primary server to a level lower than
528 : : * logical may also result in slot invalidation and removal on the standby.
529 : : * This is because such 'wal_level' change is only possible if the logical
530 : : * slots are removed on the primary server, so it's expected to see the
531 : : * slots being invalidated and removed on the standby too (and re-created
532 : : * if they are re-created on the primary server).
533 : : */
534 : : static void
535 : 27 : drop_local_obsolete_slots(List *remote_slot_list)
536 : : {
537 : 27 : List *local_slots = get_local_synced_slots();
538 : :
539 [ + + + + : 91 : foreach_ptr(ReplicationSlot, local_slot, local_slots)
+ + ]
540 : : {
541 : : /* Drop the local slot if it is not required to be retained. */
542 [ + + ]: 37 : if (!local_sync_slot_required(local_slot, remote_slot_list))
543 : : {
544 : : bool synced_slot;
545 : :
546 : : /*
547 : : * Use shared lock to prevent a conflict with
548 : : * ReplicationSlotsDropDBSlots(), trying to drop the same slot
549 : : * during a drop-database operation.
550 : : */
551 : 2 : LockSharedObject(DatabaseRelationId, local_slot->data.database,
552 : : 0, AccessShareLock);
553 : :
554 : : /*
555 : : * In the small window between getting the slot to drop and
556 : : * locking the database, there is a possibility of a parallel
557 : : * database drop by the startup process and the creation of a new
558 : : * slot by the user. This new user-created slot may end up using
559 : : * the same shared memory as that of 'local_slot'. Thus check if
560 : : * local_slot is still the synced one before performing the actual
561 : : * drop.
562 : : */
563 [ - + ]: 2 : SpinLockAcquire(&local_slot->mutex);
564 [ + - + - ]: 2 : synced_slot = local_slot->in_use && local_slot->data.synced;
565 : 2 : SpinLockRelease(&local_slot->mutex);
566 : :
567 [ + - ]: 2 : if (synced_slot)
568 : : {
569 : : /*
570 : : * Now acquire and drop the slot. Note we purposely don't
571 : : * request logical decoding to be disabled here: since this is
572 : : * a standby, which derives its logical decoding state from
573 : : * the primary, it would be wrong to do so.
574 : : */
484 575 : 2 : ReplicationSlotAcquire(NameStr(local_slot->data.name), true, false);
3 alvherre@kurilemu.de 576 :GNC 2 : ReplicationSlotDropAcquired(false);
577 : : }
578 : :
836 akapila@postgresql.o 579 :CBC 2 : UnlockSharedObject(DatabaseRelationId, local_slot->data.database,
580 : : 0, AccessShareLock);
581 : :
582 [ + - ]: 2 : ereport(LOG,
583 : : errmsg("dropped replication slot \"%s\" of database with OID %u",
584 : : NameStr(local_slot->data.name),
585 : : local_slot->data.database)); /* NOTE(review): this LOG is outside the synced_slot branch, so it is also emitted when no drop was performed; it and the unlock above re-read local_slot->data after the slot may have been dropped or reused -- confirm this is intended */
586 : : }
587 : : }
588 : 27 : }
589 : :
590 : : /*
591 : : * Reserve WAL for the currently active local slot using the specified WAL
592 : : * location (restart_lsn). The slot (MyReplicationSlot) must not have a valid restart_lsn yet (asserted below).
593 : : *
594 : : * If the given WAL location has been removed or is at risk of removal,
595 : : * reserve WAL using the oldest segment that is non-removable.
596 : : */
597 : : static void
598 : 8 : reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
599 : : {
600 : : XLogRecPtr slot_min_lsn;
601 : : XLogRecPtr min_safe_lsn;
602 : : XLogSegNo segno;
603 : 8 : ReplicationSlot *slot = MyReplicationSlot;
604 : :
605 [ - + ]: 8 : Assert(slot != NULL);
205 alvherre@kurilemu.de 606 [ - + ]: 8 : Assert(!XLogRecPtrIsValid(slot->data.restart_lsn));
607 : :
608 : : /*
609 : : * Acquire an exclusive lock to prevent the checkpoint process from
610 : : * concurrently calculating the minimum slot LSN (see
611 : : * CheckPointReplicationSlots), ensuring that if WAL reservation occurs
612 : : * first, the checkpoint must wait for the restart_lsn update before
613 : : * calculating the minimum LSN.
614 : : *
615 : : * Note: Unlike ReplicationSlotReserveWal(), this lock does not protect a
616 : : * newly synced slot from being invalidated if a concurrent checkpoint has
617 : : * invoked CheckPointReplicationSlots() before the WAL reservation here.
618 : : * This can happen because the initial restart_lsn received from the
619 : : * remote server can precede the redo pointer. Therefore, when selecting
620 : : * the initial restart_lsn, we consider using the redo pointer or the
621 : : * minimum slot LSN (if those values are greater than the remote
622 : : * restart_lsn) instead of relying solely on the remote value.
623 : : */
123 akapila@postgresql.o 624 : 8 : LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE);
625 : :
626 : : /*
627 : : * Determine the minimum non-removable LSN by comparing the redo pointer
628 : : * with the minimum slot LSN.
629 : : *
630 : : * The minimum slot LSN is considered because the redo pointer advances at
631 : : * every checkpoint, even when replication slots are present on the
632 : : * standby. In such scenarios, the redo pointer can exceed the remote
633 : : * restart_lsn, while WALs preceding the remote restart_lsn remain
634 : : * protected by a local replication slot.
635 : : */
636 : 8 : min_safe_lsn = GetRedoRecPtr();
637 : 8 : slot_min_lsn = XLogGetReplicationSlotMinimumLSN();
638 : :
639 [ + + - + ]: 8 : if (XLogRecPtrIsValid(slot_min_lsn) && min_safe_lsn > slot_min_lsn)
123 akapila@postgresql.o 640 :UBC 0 : min_safe_lsn = slot_min_lsn;
641 : :
642 : : /*
643 : : * If the minimum safe LSN is greater than the given restart_lsn, use it
644 : : * as the initial restart_lsn for the newly synced slot. Otherwise, use
645 : : * the given remote restart_lsn.
646 : : */
123 akapila@postgresql.o 647 [ - + ]:CBC 8 : SpinLockAcquire(&slot->mutex);
648 : 8 : slot->data.restart_lsn = Max(restart_lsn, min_safe_lsn);
649 : 8 : SpinLockRelease(&slot->mutex);
650 : :
651 : 8 : ReplicationSlotsComputeRequiredLSN();
652 : :
653 : 8 : XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size);
654 [ - + ]: 8 : if (XLogGetLastRemovedSegno() >= segno)
123 akapila@postgresql.o 655 [ # # ]:UBC 0 : elog(ERROR, "WAL required by replication slot %s has been removed concurrently",
656 : : NameStr(slot->data.name)); /* NOTE(review): raised while ReplicationSlotAllocationLock is still held; presumably released by error cleanup -- confirm */
657 : :
123 akapila@postgresql.o 658 :CBC 8 : LWLockRelease(ReplicationSlotAllocationLock);
836 659 : 8 : }
660 : :
661 : : /*
662 : : * If the remote restart_lsn and catalog_xmin have caught up with the
663 : : * local ones, then update the LSNs and persist the local synced slot for
664 : : * future synchronization; otherwise, do nothing.
665 : : *
666 : : * *slot_persistence_pending is set to true if any of the slots fail to
667 : : * persist. The pointer may be NULL, in which case it is left untouched.
668 : : *
669 : : * Return true if the slot is marked as RS_PERSISTENT (sync-ready), otherwise
670 : : * false.
671 : : */
672 : : static bool
166 akapila@postgresql.o 673 :GNC 13 : update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
674 : : bool *slot_persistence_pending)
675 : : {
836 akapila@postgresql.o 676 :CBC 13 : ReplicationSlot *slot = MyReplicationSlot;
677 : :
678 : : /* Slotsync skip stats are handled in function update_local_synced_slot(); its return value is deliberately ignored here */
107 akapila@postgresql.o 679 :GNC 13 : (void) update_local_synced_slot(remote_slot, remote_dbid);
680 : :
681 : : /*
682 : : * Check if the slot cannot be synchronized. Refer to the comment atop the
683 : : * file for details on this check.
684 : : */
685 [ + + ]: 13 : if (slot->slotsync_skip_reason != SS_SKIP_NONE)
686 : : {
687 : : /*
688 : : * We reach this point when the remote slot didn't catch up to locally
689 : : * reserved position, or it cannot reach the consistent point from the
690 : : * restart_lsn, or the WAL prior to the remote confirmed flush LSN has
691 : : * not been received and flushed.
692 : : *
693 : : * We do not drop the slot because the restart_lsn and confirmed_lsn
694 : : * can be ahead of the current location when recreating the slot in
695 : : * the next cycle. It may take more time to create such a slot or
696 : : * reach the consistent point. Therefore, we keep this slot and
697 : : * attempt the synchronization in the next cycle.
698 : : *
699 : : * We also update the slot_persistence_pending parameter, so the SQL
700 : : * function can retry.
701 : : */
166 702 [ + + ]: 7 : if (slot_persistence_pending)
703 : 2 : *slot_persistence_pending = true;
704 : :
828 akapila@postgresql.o 705 :GBC 7 : return false;
706 : : }
707 : :
836 akapila@postgresql.o 708 :CBC 6 : ReplicationSlotPersist();
709 : :
710 [ + - ]: 6 : ereport(LOG,
711 : : errmsg("newly created replication slot \"%s\" is sync-ready now",
712 : : remote_slot->name));
713 : :
828 714 : 6 : return true;
715 : : }
716 : :
717 : : /*
718 : : * Synchronize a single slot to the given position.
719 : : *
720 : : * This creates a new slot if there is no existing one and updates the
721 : : * metadata of the slot as per the data received from the primary server.
722 : : *
723 : : * The slot is created as a temporary slot and stays in the same state until the
724 : : * remote_slot catches up with locally reserved position and local slot is
725 : : * updated. The slot is then persisted and is considered as sync-ready for
726 : : * periodic syncs.
727 : : *
728 : : * *slot_persistence_pending is set to true if any of the slots fail to
729 : : * persist.
730 : : *
731 : : * Returns TRUE if the local slot is updated.
732 : : */
733 : : static bool
166 akapila@postgresql.o 734 :GNC 43 : synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid,
735 : : bool *slot_persistence_pending)
736 : : {
737 : : ReplicationSlot *slot;
828 akapila@postgresql.o 738 :CBC 43 : bool slot_updated = false;
739 : :
740 : : /* Search for the named slot */
836 741 [ + + ]: 43 : if ((slot = SearchNamedReplicationSlot(remote_slot->name, true)))
742 : : {
743 : : bool synced;
744 : :
745 [ - + ]: 35 : SpinLockAcquire(&slot->mutex);
746 : 35 : synced = slot->data.synced;
747 : 35 : SpinLockRelease(&slot->mutex);
748 : :
749 : : /* User-created slot with the same name exists, raise ERROR. */
750 [ - + ]: 35 : if (!synced)
836 akapila@postgresql.o 751 [ # # ]:UBC 0 : ereport(ERROR,
752 : : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
753 : : errmsg("exiting from slot synchronization because same"
754 : : " name slot \"%s\" already exists on the standby",
755 : : remote_slot->name));
756 : :
757 : : /*
758 : : * The slot has been synchronized before.
759 : : *
760 : : * It is important to acquire the slot here before checking
761 : : * invalidation. If we don't acquire the slot first, there could be a
762 : : * race condition that the local slot could be invalidated just after
763 : : * checking the 'invalidated' flag here and we could end up
764 : : * overwriting 'invalidated' flag to remote_slot's value. See
765 : : * InvalidatePossiblyObsoleteSlot() where it invalidates slot directly
766 : : * if the slot is not acquired by other processes.
767 : : *
768 : : * XXX: If it ever turns out that slot acquire/release is costly for
769 : : * cases when none of the slot properties is changed then we can do a
770 : : * pre-check to ensure that at least one of the slot properties is
771 : : * changed before acquiring the slot.
772 : : */
484 akapila@postgresql.o 773 :CBC 35 : ReplicationSlotAcquire(remote_slot->name, true, false);
774 : :
836 775 [ - + ]: 35 : Assert(slot == MyReplicationSlot);
776 : :
777 : : /*
778 : : * Copy the invalidation cause from remote only if local slot is not
779 : : * invalidated locally, we don't want to overwrite existing one.
780 : : */
781 [ + - ]: 35 : if (slot->data.invalidated == RS_INVAL_NONE &&
782 [ - + ]: 35 : remote_slot->invalidated != RS_INVAL_NONE)
783 : : {
836 akapila@postgresql.o 784 [ # # ]:UBC 0 : SpinLockAcquire(&slot->mutex);
785 : 0 : slot->data.invalidated = remote_slot->invalidated;
786 : 0 : SpinLockRelease(&slot->mutex);
787 : :
788 : : /* Make sure the invalidated state persists across server restart */
789 : 0 : ReplicationSlotMarkDirty();
790 : 0 : ReplicationSlotSave();
791 : :
828 792 : 0 : slot_updated = true;
793 : : }
794 : :
795 : : /* Skip the sync of an invalidated slot */
836 akapila@postgresql.o 796 [ - + ]:CBC 35 : if (slot->data.invalidated != RS_INVAL_NONE)
797 : : {
183 akapila@postgresql.o 798 :UNC 0 : update_slotsync_skip_stats(SS_SKIP_INVALID);
799 : :
836 akapila@postgresql.o 800 :UBC 0 : ReplicationSlotRelease();
828 801 : 0 : return slot_updated;
802 : : }
803 : :
804 : : /* Slot not ready yet, let's attempt to make it sync-ready now. */
836 akapila@postgresql.o 805 [ + + ]:CBC 35 : if (slot->data.persistency == RS_TEMPORARY)
806 : : {
828 807 : 5 : slot_updated = update_and_persist_local_synced_slot(remote_slot,
808 : : remote_dbid,
809 : : slot_persistence_pending);
810 : : }
811 : :
812 : : /* Slot ready for sync, so sync it. */
813 : : else
814 : : {
815 : : /*
816 : : * Sanity check: As long as the invalidations are handled
817 : : * appropriately as above, this should never happen.
818 : : *
819 : : * We don't need to check restart_lsn here. See the comments in
820 : : * update_local_synced_slot() for details.
821 : : */
778 822 [ - + ]: 30 : if (remote_slot->confirmed_lsn < slot->data.confirmed_flush)
778 akapila@postgresql.o 823 [ # # ]:UBC 0 : ereport(ERROR,
824 : : errmsg_internal("cannot synchronize local slot \"%s\"",
825 : : remote_slot->name),
826 : : errdetail_internal("Local slot's start streaming location LSN(%X/%08X) is ahead of remote slot's LSN(%X/%08X).",
827 : : LSN_FORMAT_ARGS(slot->data.confirmed_flush),
828 : : LSN_FORMAT_ARGS(remote_slot->confirmed_lsn)));
829 : :
107 akapila@postgresql.o 830 :GNC 30 : slot_updated = update_local_synced_slot(remote_slot, remote_dbid);
831 : : }
832 : : }
833 : : /* Otherwise create the slot first. */
834 : : else
835 : : {
836 : : NameData plugin_name;
836 akapila@postgresql.o 837 :CBC 8 : TransactionId xmin_horizon = InvalidTransactionId;
838 : :
839 : : /* Skip creating the local slot if remote_slot is invalidated already */
840 [ - + ]: 8 : if (remote_slot->invalidated != RS_INVAL_NONE)
828 akapila@postgresql.o 841 :UBC 0 : return false;
842 : :
843 : : /*
844 : : * We create temporary slots instead of ephemeral slots here because
845 : : * we want the slots to survive after releasing them. This is done to
846 : : * avoid dropping and re-creating the slots in each synchronization
847 : : * cycle if the restart_lsn or catalog_xmin of the remote slot has not
848 : : * caught up.
849 : : */
836 akapila@postgresql.o 850 :CBC 8 : ReplicationSlotCreate(remote_slot->name, true, RS_TEMPORARY,
851 : 8 : remote_slot->two_phase,
852 : : false,
853 : 8 : remote_slot->failover,
854 : : true);
855 : :
856 : : /* For shorter lines. */
857 : 8 : slot = MyReplicationSlot;
858 : :
859 : : /* Avoid expensive operations while holding a spinlock. */
860 : 8 : namestrcpy(&plugin_name, remote_slot->plugin);
861 : :
862 [ - + ]: 8 : SpinLockAcquire(&slot->mutex);
863 : 8 : slot->data.database = remote_dbid;
864 : 8 : slot->data.plugin = plugin_name;
865 : 8 : SpinLockRelease(&slot->mutex);
866 : :
867 : 8 : reserve_wal_for_local_slot(remote_slot->restart_lsn);
868 : :
151 msawada@postgresql.o 869 : 8 : LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
836 akapila@postgresql.o 870 : 8 : LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
871 : 8 : xmin_horizon = GetOldestSafeDecodingTransactionId(true);
872 [ - + ]: 8 : SpinLockAcquire(&slot->mutex);
873 : 8 : slot->effective_catalog_xmin = xmin_horizon;
874 : 8 : slot->data.catalog_xmin = xmin_horizon;
875 : 8 : SpinLockRelease(&slot->mutex);
876 : 8 : ReplicationSlotsComputeRequiredXmin(true);
877 : 8 : LWLockRelease(ProcArrayLock);
151 msawada@postgresql.o 878 : 8 : LWLockRelease(ReplicationSlotControlLock);
879 : :
166 akapila@postgresql.o 880 :GNC 8 : update_and_persist_local_synced_slot(remote_slot, remote_dbid,
881 : : slot_persistence_pending);
882 : :
828 akapila@postgresql.o 883 :CBC 8 : slot_updated = true;
884 : : }
885 : :
836 886 : 43 : ReplicationSlotRelease();
887 : :
828 888 : 43 : return slot_updated;
889 : : }
890 : :
891 : : /*
892 : : * Fetch remote slots.
893 : : *
894 : : * If slot_names is NIL, fetches all failover logical slots from the
895 : : * primary server, otherwise fetches only the ones with names in slot_names.
896 : : *
897 : : * Returns a list of remote slot information structures, or NIL if none
898 : : * are found.
899 : : */
900 : : static List *
166 akapila@postgresql.o 901 :GNC 29 : fetch_remote_slots(WalReceiverConn *wrconn, List *slot_names)
902 : : {
903 : : #define SLOTSYNC_COLUMN_COUNT 10
836 akapila@postgresql.o 904 :CBC 29 : Oid slotRow[SLOTSYNC_COLUMN_COUNT] = {TEXTOID, TEXTOID, LSNOID,
905 : : LSNOID, XIDOID, BOOLOID, LSNOID, BOOLOID, TEXTOID, TEXTOID};
906 : :
907 : : WalRcvExecResult *res;
908 : : TupleTableSlot *tupslot;
909 : 29 : List *remote_slot_list = NIL;
910 : : StringInfoData query;
911 : :
166 akapila@postgresql.o 912 :GNC 29 : initStringInfo(&query);
913 : 29 : appendStringInfoString(&query,
914 : : "SELECT slot_name, plugin, confirmed_flush_lsn,"
915 : : " restart_lsn, catalog_xmin, two_phase,"
916 : : " two_phase_at, failover,"
917 : : " database, invalidation_reason"
918 : : " FROM pg_catalog.pg_replication_slots"
919 : : " WHERE failover and NOT temporary");
920 : :
921 [ + + ]: 29 : if (slot_names != NIL)
922 : : {
923 : 2 : bool first_slot = true;
924 : :
925 : : /*
926 : : * Construct the query to fetch only the specified slots
927 : : */
928 : 2 : appendStringInfoString(&query, " AND slot_name IN (");
929 : :
930 [ + - + + : 6 : foreach_ptr(char, slot_name, slot_names)
+ + ]
931 : : {
932 [ - + ]: 2 : if (!first_slot)
166 akapila@postgresql.o 933 :UNC 0 : appendStringInfoString(&query, ", ");
934 : :
47 drowley@postgresql.o 935 :GNC 2 : appendStringInfoString(&query, quote_literal_cstr(slot_name));
166 akapila@postgresql.o 936 : 2 : first_slot = false;
937 : : }
938 : 2 : appendStringInfoChar(&query, ')');
939 : : }
940 : :
941 : : /* Execute the query */
942 : 29 : res = walrcv_exec(wrconn, query.data, SLOTSYNC_COLUMN_COUNT, slotRow);
943 : 29 : pfree(query.data);
836 akapila@postgresql.o 944 [ + + ]:CBC 29 : if (res->status != WALRCV_OK_TUPLES)
945 [ + - ]: 2 : ereport(ERROR,
946 : : errmsg("could not fetch failover logical slots info from the primary server: %s",
947 : : res->err));
948 : :
949 : 27 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
950 [ + + ]: 70 : while (tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
951 : : {
952 : : bool isnull;
171 michael@paquier.xyz 953 :GNC 43 : RemoteSlot *remote_slot = palloc0_object(RemoteSlot);
954 : : Datum d;
836 akapila@postgresql.o 955 :CBC 43 : int col = 0;
956 : :
957 : 43 : remote_slot->name = TextDatumGetCString(slot_getattr(tupslot, ++col,
958 : : &isnull));
959 [ - + ]: 43 : Assert(!isnull);
960 : :
961 : 43 : remote_slot->plugin = TextDatumGetCString(slot_getattr(tupslot, ++col,
962 : : &isnull));
963 [ - + ]: 43 : Assert(!isnull);
964 : :
965 : : /*
966 : : * It is possible to get null values for LSN and Xmin if slot is
967 : : * invalidated on the primary server, so handle accordingly.
968 : : */
969 : 43 : d = slot_getattr(tupslot, ++col, &isnull);
970 [ + - ]: 43 : remote_slot->confirmed_lsn = isnull ? InvalidXLogRecPtr :
971 : 43 : DatumGetLSN(d);
972 : :
973 : 43 : d = slot_getattr(tupslot, ++col, &isnull);
974 [ + - ]: 43 : remote_slot->restart_lsn = isnull ? InvalidXLogRecPtr : DatumGetLSN(d);
975 : :
976 : 43 : d = slot_getattr(tupslot, ++col, &isnull);
977 [ + - ]: 43 : remote_slot->catalog_xmin = isnull ? InvalidTransactionId :
978 : 43 : DatumGetTransactionId(d);
979 : :
980 : 43 : remote_slot->two_phase = DatumGetBool(slot_getattr(tupslot, ++col,
981 : : &isnull));
982 [ - + ]: 43 : Assert(!isnull);
983 : :
422 984 : 43 : d = slot_getattr(tupslot, ++col, &isnull);
985 [ + + ]: 43 : remote_slot->two_phase_at = isnull ? InvalidXLogRecPtr : DatumGetLSN(d);
986 : :
836 987 : 43 : remote_slot->failover = DatumGetBool(slot_getattr(tupslot, ++col,
988 : : &isnull));
989 [ - + ]: 43 : Assert(!isnull);
990 : :
991 : 43 : remote_slot->database = TextDatumGetCString(slot_getattr(tupslot,
992 : : ++col, &isnull));
993 [ - + ]: 43 : Assert(!isnull);
994 : :
995 : 43 : d = slot_getattr(tupslot, ++col, &isnull);
996 [ - + ]: 43 : remote_slot->invalidated = isnull ? RS_INVAL_NONE :
836 akapila@postgresql.o 997 :UBC 0 : GetSlotInvalidationCause(TextDatumGetCString(d));
998 : :
999 : : /* Sanity check */
836 akapila@postgresql.o 1000 [ - + ]:CBC 43 : Assert(col == SLOTSYNC_COLUMN_COUNT);
1001 : :
1002 : : /*
1003 : : * If restart_lsn, confirmed_lsn or catalog_xmin is invalid but the
1004 : : * slot is valid, that means we have fetched the remote_slot in its
1005 : : * RS_EPHEMERAL state. In such a case, don't sync it; we can always
1006 : : * sync it in the next sync cycle when the remote_slot is persisted
1007 : : * and has valid lsn(s) and xmin values.
1008 : : *
1009 : : * XXX: In future, if we plan to expose 'slot->data.persistency' in
1010 : : * pg_replication_slots view, then we can avoid fetching RS_EPHEMERAL
1011 : : * slots in the first place.
1012 : : */
205 alvherre@kurilemu.de 1013 [ + - ]:GNC 43 : if ((!XLogRecPtrIsValid(remote_slot->restart_lsn) ||
1014 [ + - ]: 43 : !XLogRecPtrIsValid(remote_slot->confirmed_lsn) ||
836 akapila@postgresql.o 1015 [ - + ]:CBC 43 : !TransactionIdIsValid(remote_slot->catalog_xmin)) &&
836 akapila@postgresql.o 1016 [ # # ]:UBC 0 : remote_slot->invalidated == RS_INVAL_NONE)
1017 : 0 : pfree(remote_slot);
1018 : : else
1019 : : /* Create list of remote slots */
836 akapila@postgresql.o 1020 :CBC 43 : remote_slot_list = lappend(remote_slot_list, remote_slot);
1021 : :
1022 : 43 : ExecClearTuple(tupslot);
1023 : : }
1024 : :
4 akapila@postgresql.o 1025 :GNC 27 : ExecDropSingleTupleTableSlot(tupslot);
166 1026 : 27 : walrcv_clear_result(res);
1027 : :
1028 : 27 : return remote_slot_list;
1029 : : }
1030 : :
1031 : : /*
1032 : : * Synchronize slots.
1033 : : *
1034 : : * This function takes a list of remote slots and synchronizes them locally. It
1035 : : * creates the slots if not present on the standby and updates existing ones.
1036 : : *
1037 : : * If slot_persistence_pending is not NULL, it will be set to true if one or
1038 : : * more slots could not be persisted. This allows callers such as
1039 : : * SyncReplicationSlots() to retry those slots.
1040 : : *
1041 : : * Returns TRUE if any of the slots gets updated in this sync-cycle.
1042 : : */
1043 : : static bool
1044 : 27 : synchronize_slots(WalReceiverConn *wrconn, List *remote_slot_list,
1045 : : bool *slot_persistence_pending)
1046 : : {
1047 : 27 : bool some_slot_updated = false;
1048 : :
1049 : : /* Drop local slots that no longer need to be synced. */
836 akapila@postgresql.o 1050 :CBC 27 : drop_local_obsolete_slots(remote_slot_list);
1051 : :
1052 : : /* Now sync the slots locally */
1053 [ + - + + : 97 : foreach_ptr(RemoteSlot, remote_slot, remote_slot_list)
+ + ]
1054 : : {
1055 : 43 : Oid remote_dbid = get_database_oid(remote_slot->database, false);
1056 : :
1057 : : /*
1058 : : * Use shared lock to prevent a conflict with
1059 : : * ReplicationSlotsDropDBSlots(), trying to drop the same slot during
1060 : : * a drop-database operation.
1061 : : */
1062 : 43 : LockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
1063 : :
166 akapila@postgresql.o 1064 :GNC 43 : some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid,
1065 : : slot_persistence_pending);
1066 : :
836 akapila@postgresql.o 1067 :CBC 43 : UnlockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
1068 : : }
1069 : :
828 1070 : 27 : return some_slot_updated;
1071 : : }
1072 : :
1073 : : /*
1074 : : * Checks the remote server info.
1075 : : *
1076 : : * We ensure that the 'primary_slot_name' exists on the remote server and the
1077 : : * remote server is not a standby node.
1078 : : */
1079 : : static void
836 1080 : 15 : validate_remote_info(WalReceiverConn *wrconn)
1081 : : {
1082 : : #define PRIMARY_INFO_OUTPUT_COL_COUNT 2
1083 : : WalRcvExecResult *res;
1084 : 15 : Oid slotRow[PRIMARY_INFO_OUTPUT_COL_COUNT] = {BOOLOID, BOOLOID};
1085 : : StringInfoData cmd;
1086 : : bool isnull;
1087 : : TupleTableSlot *tupslot;
1088 : : bool remote_in_recovery;
1089 : : bool primary_slot_valid;
828 1090 : 15 : bool started_tx = false;
1091 : :
836 1092 : 15 : initStringInfo(&cmd);
1093 : 15 : appendStringInfo(&cmd,
1094 : : "SELECT pg_is_in_recovery(), count(*) = 1"
1095 : : " FROM pg_catalog.pg_replication_slots"
1096 : : " WHERE slot_type='physical' AND slot_name=%s",
1097 : : quote_literal_cstr(PrimarySlotName));
1098 : :
1099 : : /* The syscache access in walrcv_exec() needs a transaction env. */
828 1100 [ + + ]: 15 : if (!IsTransactionState())
1101 : : {
1102 : 6 : StartTransactionCommand();
1103 : 6 : started_tx = true;
1104 : : }
1105 : :
836 1106 : 15 : res = walrcv_exec(wrconn, cmd.data, PRIMARY_INFO_OUTPUT_COL_COUNT, slotRow);
1107 : 15 : pfree(cmd.data);
1108 : :
1109 [ - + ]: 15 : if (res->status != WALRCV_OK_TUPLES)
836 akapila@postgresql.o 1110 [ # # ]:UBC 0 : ereport(ERROR,
1111 : : errmsg("could not fetch primary slot name \"%s\" info from the primary server: %s",
1112 : : PrimarySlotName, res->err),
1113 : : errhint("Check if \"primary_slot_name\" is configured correctly."));
1114 : :
836 akapila@postgresql.o 1115 :CBC 15 : tupslot = MakeSingleTupleTableSlot(res->tupledesc, &TTSOpsMinimalTuple);
1116 [ - + ]: 15 : if (!tuplestore_gettupleslot(res->tuplestore, true, false, tupslot))
836 akapila@postgresql.o 1117 [ # # ]:UBC 0 : elog(ERROR,
1118 : : "failed to fetch tuple for the primary server slot specified by \"primary_slot_name\"");
1119 : :
836 akapila@postgresql.o 1120 :CBC 15 : remote_in_recovery = DatumGetBool(slot_getattr(tupslot, 1, &isnull));
1121 [ - + ]: 15 : Assert(!isnull);
1122 : :
1123 : : /*
1124 : : * Slot sync is currently not supported on a cascading standby. This is
1125 : : * because if we allow it, the primary server needs to wait for all the
1126 : : * cascading standbys, otherwise, logical subscribers can still be ahead
1127 : : * of one of the cascading standbys which we plan to promote. Thus, to
1128 : : * avoid this additional complexity, we restrict it for the time being.
1129 : : */
1130 [ + + ]: 15 : if (remote_in_recovery)
1131 [ + - ]: 1 : ereport(ERROR,
1132 : : errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1133 : : errmsg("cannot synchronize replication slots from a standby server"));
1134 : :
1135 : 14 : primary_slot_valid = DatumGetBool(slot_getattr(tupslot, 2, &isnull));
1136 [ - + ]: 14 : Assert(!isnull);
1137 : :
1138 [ - + ]: 14 : if (!primary_slot_valid)
836 akapila@postgresql.o 1139 [ # # ]:UBC 0 : ereport(ERROR,
1140 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1141 : : /* translator: second %s is a GUC variable name */
1142 : : errmsg("replication slot \"%s\" specified by \"%s\" does not exist on primary server",
1143 : : PrimarySlotName, "primary_slot_name"));
1144 : :
4 akapila@postgresql.o 1145 :GNC 14 : ExecDropSingleTupleTableSlot(tupslot);
836 akapila@postgresql.o 1146 :CBC 14 : walrcv_clear_result(res);
1147 : :
828 1148 [ + + ]: 14 : if (started_tx)
1149 : 6 : CommitTransactionCommand();
836 1150 : 14 : }
1151 : :
1152 : : /*
1153 : : * Checks if dbname is specified in 'primary_conninfo'.
1154 : : *
1155 : : * Error out if not specified otherwise return it.
1156 : : */
1157 : : char *
828 1158 : 16 : CheckAndGetDbnameFromConninfo(void)
1159 : : {
1160 : : char *dbname;
1161 : :
1162 : : /*
1163 : : * The slot synchronization needs a database connection for walrcv_exec to
1164 : : * work.
1165 : : */
1166 : 16 : dbname = walrcv_get_dbname_from_conninfo(PrimaryConnInfo);
1167 [ + + ]: 16 : if (dbname == NULL)
1168 [ + - ]: 1 : ereport(ERROR,
1169 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1170 : :
1171 : : /*
1172 : : * translator: first %s is a connection option; second %s is a GUC
1173 : : * variable name
1174 : : */
1175 : : errmsg("replication slot synchronization requires \"%s\" to be specified in \"%s\"",
1176 : : "dbname", "primary_conninfo"));
1177 : 15 : return dbname;
1178 : : }
1179 : :
1180 : : /*
1181 : : * Return true if all necessary GUCs for slot synchronization are set
1182 : : * appropriately, otherwise, return false.
1183 : : */
1184 : : bool
1185 : 22 : ValidateSlotSyncParams(int elevel)
1186 : : {
1187 : : /*
1188 : : * Logical slot sync/creation requires logical decoding to be enabled.
1189 : : */
158 msawada@postgresql.o 1190 [ - + ]:GNC 22 : if (!IsLogicalDecodingEnabled())
1191 : : {
299 fujii@postgresql.org 1192 [ # # ]:UBC 0 : ereport(elevel,
1193 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1194 : : errmsg("replication slot synchronization requires \"effective_wal_level\" >= \"logical\" on the primary"),
1195 : : errhint("To enable logical decoding on primary, set \"wal_level\" >= \"logical\" or create at least one logical slot when \"wal_level\" = \"replica\"."));
1196 : :
1197 : 0 : return false;
1198 : : }
1199 : :
1200 : : /*
1201 : : * A physical replication slot(primary_slot_name) is required on the
1202 : : * primary to ensure that the rows needed by the standby are not removed
1203 : : * after restarting, so that the synchronized slot on the standby will not
1204 : : * be invalidated.
1205 : : */
836 akapila@postgresql.o 1206 [ + - - + ]:CBC 22 : if (PrimarySlotName == NULL || *PrimarySlotName == '\0')
1207 : : {
828 akapila@postgresql.o 1208 [ # # ]:UBC 0 : ereport(elevel,
1209 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1210 : : /* translator: %s is a GUC variable name */
1211 : : errmsg("replication slot synchronization requires \"%s\" to be set", "primary_slot_name"));
1212 : 0 : return false;
1213 : : }
1214 : :
1215 : : /*
1216 : : * hot_standby_feedback must be enabled to cooperate with the physical
1217 : : * replication slot, which allows informing the primary about the xmin and
1218 : : * catalog_xmin values on the standby.
1219 : : */
836 akapila@postgresql.o 1220 [ + + ]:CBC 22 : if (!hot_standby_feedback)
1221 : : {
828 1222 [ + - ]: 1 : ereport(elevel,
1223 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1224 : : /* translator: %s is a GUC variable name */
1225 : : errmsg("replication slot synchronization requires \"%s\" to be enabled",
1226 : : "hot_standby_feedback"));
1227 : 1 : return false;
1228 : : }
1229 : :
1230 : : /*
1231 : : * The primary_conninfo is required to make connection to primary for
1232 : : * getting slots information.
1233 : : */
836 1234 [ + - - + ]: 21 : if (PrimaryConnInfo == NULL || *PrimaryConnInfo == '\0')
1235 : : {
828 akapila@postgresql.o 1236 [ # # ]:UBC 0 : ereport(elevel,
1237 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1238 : : /* translator: %s is a GUC variable name */
1239 : : errmsg("replication slot synchronization requires \"%s\" to be set",
1240 : : "primary_conninfo"));
1241 : 0 : return false;
1242 : : }
1243 : :
828 akapila@postgresql.o 1244 :CBC 21 : return true;
1245 : : }
1246 : :
1247 : : /*
1248 : : * Re-read the config file for slot synchronization.
1249 : : *
1250 : : * Exit or throw error if relevant GUCs have changed depending on whether
1251 : : * called from slot sync worker or from the SQL function pg_sync_replication_slots()
1252 : : */
1253 : : static void
1254 : 1 : slotsync_reread_config(void)
1255 : : {
1256 : 1 : char *old_primary_conninfo = pstrdup(PrimaryConnInfo);
1257 : 1 : char *old_primary_slotname = pstrdup(PrimarySlotName);
1258 : 1 : bool old_sync_replication_slots = sync_replication_slots;
1259 : 1 : bool old_hot_standby_feedback = hot_standby_feedback;
1260 : : bool conninfo_changed;
1261 : : bool primary_slotname_changed;
170 1262 : 1 : bool is_slotsync_worker = AmLogicalSlotSyncWorkerProcess();
1263 : 1 : bool parameter_changed = false;
1264 : :
1265 [ + - ]: 1 : if (is_slotsync_worker)
1266 [ - + ]: 1 : Assert(sync_replication_slots);
1267 : :
828 1268 : 1 : ConfigReloadPending = false;
1269 : 1 : ProcessConfigFile(PGC_SIGHUP);
1270 : :
1271 : 1 : conninfo_changed = strcmp(old_primary_conninfo, PrimaryConnInfo) != 0;
1272 : 1 : primary_slotname_changed = strcmp(old_primary_slotname, PrimarySlotName) != 0;
1273 : 1 : pfree(old_primary_conninfo);
1274 : 1 : pfree(old_primary_slotname);
1275 : :
1276 [ - + ]: 1 : if (old_sync_replication_slots != sync_replication_slots)
1277 : : {
170 akapila@postgresql.o 1278 [ # # ]:UBC 0 : if (is_slotsync_worker)
1279 : : {
1280 [ # # ]: 0 : ereport(LOG,
1281 : : /* translator: %s is a GUC variable name */
1282 : : errmsg("replication slot synchronization worker will stop because \"%s\" is disabled",
1283 : : "sync_replication_slots"));
1284 : :
1285 : 0 : proc_exit(0);
1286 : : }
1287 : :
1288 : 0 : parameter_changed = true;
1289 : : }
1290 : : else
1291 : : {
170 akapila@postgresql.o 1292 [ + - + - ]:CBC 1 : if (conninfo_changed ||
1293 : 1 : primary_slotname_changed ||
1294 [ + - ]: 1 : (old_hot_standby_feedback != hot_standby_feedback))
1295 : : {
1296 : :
1297 [ + - ]: 1 : if (is_slotsync_worker)
1298 : : {
1299 [ + - ]: 1 : ereport(LOG,
1300 : : errmsg("replication slot synchronization worker will restart because of a parameter change"));
1301 : :
1302 : : /*
1303 : : * Reset the last-start time for this worker so that the
1304 : : * postmaster can restart it without waiting for
1305 : : * SLOTSYNC_RESTART_INTERVAL_SEC.
1306 : : */
1307 : 1 : SlotSyncCtx->last_start_time = 0;
1308 : :
1309 : 1 : proc_exit(0);
1310 : : }
1311 : :
170 akapila@postgresql.o 1312 :UBC 0 : parameter_changed = true;
1313 : : }
1314 : : }
1315 : :
1316 : : /*
1317 : : * If we have reached here with a parameter change, we must be running in
1318 : : * SQL function, emit error in such a case.
1319 : : */
1320 [ # # ]: 0 : if (parameter_changed)
1321 : : {
1322 [ # # ]: 0 : Assert(!is_slotsync_worker);
1323 [ # # ]: 0 : ereport(ERROR,
1324 : : errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1325 : : errmsg("replication slot synchronization will stop because of a parameter change"));
1326 : : }
1327 : :
828 1328 : 0 : }
1329 : :
1330 : : /*
1331 : : * Handle receipt of an interrupt indicating a slotsync shutdown message.
1332 : : *
1333 : : * This is called within the SIGUSR1 handler. All we do here is set a flag
1334 : : * that will cause the next CHECK_FOR_INTERRUPTS() to invoke
1335 : : * ProcessSlotSyncMessage().
1336 : : */
1337 : : void
52 fujii@postgresql.org 1338 :CBC 1 : HandleSlotSyncMessageInterrupt(void)
1339 : : {
1340 : 1 : InterruptPending = true;
1341 : 1 : SlotSyncShutdownPending = true;
1342 : : /* latch will be set by procsignal_sigusr1_handler */
1343 : 1 : }
1344 : :
1345 : : /*
1346 : : * Handle a PROCSIG_SLOTSYNC_MESSAGE signal, called from ProcessInterrupts().
1347 : : *
1348 : : * If the current process is the slotsync background worker, log a message
1349 : : * and exit cleanly. If it is a backend executing pg_sync_replication_slots(),
1350 : : * raise an error, unless the sync has already finished, in which case there
1351 : : * is no need to interrupt the caller.
1352 : : */
1353 : : void
1354 : 1 : ProcessSlotSyncMessage(void)
1355 : : {
1356 : 1 : SlotSyncShutdownPending = false;
1357 : :
1358 [ + - ]: 1 : if (AmLogicalSlotSyncWorkerProcess())
1359 : : {
1360 [ + - ]: 1 : ereport(LOG,
1361 : : errmsg("replication slot synchronization worker will stop because promotion is triggered"));
1362 : 1 : proc_exit(0);
1363 : : }
1364 : : else
1365 : : {
1366 : : /*
1367 : : * If sync has already completed, there is no need to interrupt the
1368 : : * caller with an error.
1369 : : */
52 fujii@postgresql.org 1370 [ # # ]:UBC 0 : if (!IsSyncingReplicationSlots())
1371 : 0 : return;
1372 : :
1373 [ # # ]: 0 : ereport(ERROR,
1374 : : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1375 : : errmsg("replication slot synchronization will stop because promotion is triggered"));
1376 : : }
1377 : : }
1378 : :
1379 : : /*
1380 : : * Connection cleanup function for slotsync worker.
1381 : : *
1382 : : * Called on slotsync worker exit.
1383 : : */
1384 : : static void
765 akapila@postgresql.o 1385 :CBC 6 : slotsync_worker_disconnect(int code, Datum arg)
1386 : : {
1387 : 6 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1388 : :
1389 : 6 : walrcv_disconnect(wrconn);
1390 : 6 : }
1391 : :
1392 : : /*
1393 : : * Cleanup function for slotsync worker.
1394 : : *
1395 : : * Called on slotsync worker exit.
1396 : : */
1397 : : static void
828 1398 : 6 : slotsync_worker_onexit(int code, Datum arg)
1399 : : {
1400 : : /*
1401 : : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1402 : : *
1403 : : * The startup process during promotion invokes ShutDownSlotSync() which
1404 : : * waits for slot sync to finish and it does that by checking the
1405 : : * 'syncing' flag. Thus the slot sync worker must be done with slots'
1406 : : * release and cleanup to avoid any dangling temporary slots or active
1407 : : * slots before it marks itself as finished syncing.
1408 : : */
1409 : :
1410 : : /* Make sure active replication slots are released */
765 1411 [ - + ]: 6 : if (MyReplicationSlot != NULL)
765 akapila@postgresql.o 1412 :UBC 0 : ReplicationSlotRelease();
1413 : :
1414 : : /* Also cleanup the temporary slots. */
765 akapila@postgresql.o 1415 :CBC 6 : ReplicationSlotCleanup(false);
1416 : :
828 1417 [ - + ]: 6 : SpinLockAcquire(&SlotSyncCtx->mutex);
1418 : :
1419 : 6 : SlotSyncCtx->pid = InvalidPid;
1420 : :
1421 : : /*
1422 : : * If syncing_slots is true, it indicates that the process errored out
1423 : : * without resetting the flag. So, we need to clean up shared memory and
1424 : : * reset the flag here.
1425 : : */
765 1426 [ + - ]: 6 : if (syncing_slots)
1427 : : {
1428 : 6 : SlotSyncCtx->syncing = false;
1429 : 6 : syncing_slots = false;
1430 : : }
1431 : :
828 1432 : 6 : SpinLockRelease(&SlotSyncCtx->mutex);
1433 : 6 : }
1434 : :
1435 : : /*
1436 : : * Sleep for long enough that we believe it's likely that the slots on primary
1437 : : * get updated.
1438 : : *
1439 : : * If there is no slot activity the wait time between sync-cycles will double
1440 : : * (to a maximum of 30s). If there is some slot activity the wait time between
1441 : : * sync-cycles is reset to the minimum (200ms).
1442 : : */
1443 : : static void
1444 : 19 : wait_for_slot_activity(bool some_slot_updated)
1445 : : {
1446 : : int rc;
1447 : :
1448 [ + + ]: 19 : if (!some_slot_updated)
1449 : : {
1450 : : /*
1451 : : * No slots were updated, so double the sleep time, but not beyond the
1452 : : * maximum allowable value.
1453 : : */
821 1454 : 10 : sleep_ms = Min(sleep_ms * 2, MAX_SLOTSYNC_WORKER_NAPTIME_MS);
1455 : : }
1456 : : else
1457 : : {
1458 : : /*
1459 : : * Some slots were updated since the last sleep, so reset the sleep
1460 : : * time.
1461 : : */
1462 : 9 : sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS;
1463 : : }
1464 : :
828 1465 : 19 : rc = WaitLatch(MyLatch,
1466 : : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1467 : : sleep_ms,
1468 : : WAIT_EVENT_REPLICATION_SLOTSYNC_MAIN);
1469 : :
1470 [ + + ]: 19 : if (rc & WL_LATCH_SET)
1471 : 3 : ResetLatch(MyLatch);
1472 : 19 : }
1473 : :
1474 : : /*
1475 : : * Emit an error if a concurrent sync call is in progress.
1476 : : * Otherwise, advertise that a sync is in progress.
1477 : : */
1478 : : static void
170 1479 : 15 : check_and_set_sync_info(pid_t sync_process_pid)
1480 : : {
765 1481 [ - + ]: 15 : SpinLockAcquire(&SlotSyncCtx->mutex);
1482 : :
1483 : : /*
1484 : : * Exit immediately if promotion has been triggered. This guards against
1485 : : * a new worker (or a call to pg_sync_replication_slots()) that starts
1486 : : * after the old worker was stopped by ShutDownSlotSync().
1487 : : */
52 fujii@postgresql.org 1488 [ - + ]: 15 : if (SlotSyncCtx->stopSignaled)
1489 : : {
52 fujii@postgresql.org 1490 :UBC 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1491 : :
1492 [ # # ]: 0 : if (AmLogicalSlotSyncWorkerProcess())
1493 : : {
1494 [ # # ]: 0 : ereport(DEBUG1,
1495 : : errmsg("replication slot synchronization worker will not start because promotion was triggered"));
1496 : :
1497 : 0 : proc_exit(0);
1498 : : }
1499 : : else
1500 : : {
1501 : : /*
1502 : : * For the backend executing SQL function
1503 : : * pg_sync_replication_slots().
1504 : : */
1505 [ # # ]: 0 : ereport(ERROR,
1506 : : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1507 : : errmsg("replication slot synchronization will not start because promotion was triggered"));
1508 : : }
1509 : : }
1510 : :
765 akapila@postgresql.o 1511 [ - + ]:CBC 15 : if (SlotSyncCtx->syncing)
1512 : : {
765 akapila@postgresql.o 1513 :UBC 0 : SpinLockRelease(&SlotSyncCtx->mutex);
1514 [ # # ]: 0 : ereport(ERROR,
1515 : : errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1516 : : errmsg("cannot synchronize replication slots concurrently"));
1517 : : }
1518 : :
1519 : : /* The pid must not be already assigned in SlotSyncCtx */
170 akapila@postgresql.o 1520 [ - + ]:CBC 15 : Assert(SlotSyncCtx->pid == InvalidPid);
1521 : :
765 1522 : 15 : SlotSyncCtx->syncing = true;
1523 : :
1524 : : /*
1525 : : * Advertise the required PID so that the startup process can kill the
1526 : : * slot sync process on promotion.
1527 : : */
170 1528 : 15 : SlotSyncCtx->pid = sync_process_pid;
1529 : :
765 1530 : 15 : SpinLockRelease(&SlotSyncCtx->mutex);
1531 : :
1532 : 15 : syncing_slots = true;
1533 : 15 : }
1534 : :
1535 : : /*
1536 : : * Reset syncing flag.
1537 : : */
1538 : : static void
178 nathan@postgresql.or 1539 :GNC 9 : reset_syncing_flag(void)
1540 : : {
765 akapila@postgresql.o 1541 [ - + ]:CBC 9 : SpinLockAcquire(&SlotSyncCtx->mutex);
1542 : 9 : SlotSyncCtx->syncing = false;
170 1543 : 9 : SlotSyncCtx->pid = InvalidPid;
765 1544 : 9 : SpinLockRelease(&SlotSyncCtx->mutex);
1545 : :
1546 : 9 : syncing_slots = false;
261 peter@eisentraut.org 1547 : 9 : }
1548 : :
1549 : : /*
1550 : : * The main loop of our worker process.
1551 : : *
1552 : : * It connects to the primary server, fetches logical failover slots
1553 : : * information periodically in order to create and sync the slots.
1554 : : *
1555 : : * Note: If any changes are made here, check if the corresponding SQL
1556 : : * function logic in SyncReplicationSlots() also needs to be changed.
1557 : : */
1558 : : void
463 1559 : 6 : ReplSlotSyncWorkerMain(const void *startup_data, size_t startup_data_len)
1560 : : {
828 akapila@postgresql.o 1561 : 6 : WalReceiverConn *wrconn = NULL;
1562 : : char *dbname;
1563 : : char *err;
1564 : : sigjmp_buf local_sigjmp_buf;
1565 : : StringInfoData app_name;
1566 : :
803 heikki.linnakangas@i 1567 [ - + ]: 6 : Assert(startup_data_len == 0);
1568 : :
1569 : : /* Release postmaster's working memory context */
54 fujii@postgresql.org 1570 [ + - ]:GNC 6 : if (PostmasterContext)
1571 : : {
1572 : 6 : MemoryContextDelete(PostmasterContext);
1573 : 6 : PostmasterContext = NULL;
1574 : : }
1575 : :
828 akapila@postgresql.o 1576 :CBC 6 : init_ps_display(NULL);
1577 : :
697 heikki.linnakangas@i 1578 [ - + ]: 6 : Assert(GetProcessingMode() == InitProcessing);
1579 : :
1580 : : /*
1581 : : * Create a per-backend PGPROC struct in shared memory. We must do this
1582 : : * before we access any shared memory.
1583 : : */
828 akapila@postgresql.o 1584 : 6 : InitProcess();
1585 : :
1586 : : /*
1587 : : * Early initialization.
1588 : : */
1589 : 6 : BaseInit();
1590 : :
1591 [ - + ]: 6 : Assert(SlotSyncCtx != NULL);
1592 : :
1593 : : /*
1594 : : * If an exception is encountered, processing resumes here.
1595 : : *
1596 : : * We just need to clean up, report the error, and go away.
1597 : : *
1598 : : * If we do not have this handling here, then since this worker process
1599 : : * operates at the bottom of the exception stack, ERRORs turn into FATALs.
1600 : : * Therefore, we create our own exception handler to catch ERRORs.
1601 : : */
1602 [ + + ]: 6 : if (sigsetjmp(local_sigjmp_buf, 1) != 0)
1603 : : {
1604 : : /* since not using PG_TRY, must reset error stack by hand */
1605 : 2 : error_context_stack = NULL;
1606 : :
1607 : : /* Prevents interrupts while cleaning up */
1608 : 2 : HOLD_INTERRUPTS();
1609 : :
1610 : : /* Report the error to the server log */
1611 : 2 : EmitErrorReport();
1612 : :
1613 : : /*
1614 : : * We can now go away. Note that because we called InitProcess, a
1615 : : * callback was registered to do ProcKill, which will clean up
1616 : : * necessary state.
1617 : : */
1618 : 2 : proc_exit(0);
1619 : : }
1620 : :
1621 : : /* We can now handle ereport(ERROR) */
1622 : 6 : PG_exception_stack = &local_sigjmp_buf;
1623 : :
1624 : : /* Setup signal handling */
765 1625 : 6 : pqsignal(SIGHUP, SignalHandlerForConfigReload);
172 1626 : 6 : pqsignal(SIGINT, StatementCancelHandler);
765 1627 : 6 : pqsignal(SIGTERM, die);
1628 : 6 : pqsignal(SIGFPE, FloatExceptionHandler);
1629 : 6 : pqsignal(SIGUSR1, procsignal_sigusr1_handler);
46 andrew@dunslane.net 1630 :GNC 6 : pqsignal(SIGUSR2, PG_SIG_IGN);
1631 : 6 : pqsignal(SIGPIPE, PG_SIG_IGN);
1632 : 6 : pqsignal(SIGCHLD, PG_SIG_DFL);
1633 : :
765 akapila@postgresql.o 1634 :CBC 6 : check_and_set_sync_info(MyProcPid);
1635 : :
1636 [ + - ]: 6 : ereport(LOG, errmsg("slot sync worker started"));
1637 : :
1638 : : /* Register it as soon as SlotSyncCtx->pid is initialized. */
1639 : 6 : before_shmem_exit(slotsync_worker_onexit, (Datum) 0);
1640 : :
1641 : : /*
1642 : : * Establishes SIGALRM handler and initialize timeout module. It is needed
1643 : : * by InitPostgres to register different timeouts.
1644 : : */
1645 : 6 : InitializeTimeouts();
1646 : :
1647 : : /* Load the libpq-specific functions */
1648 : 6 : load_file("libpqwalreceiver", false);
1649 : :
1650 : : /*
1651 : : * Unblock signals (they were blocked when the postmaster forked us)
1652 : : */
828 1653 : 6 : sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
1654 : :
1655 : : /*
1656 : : * Set always-secure search path, so malicious users can't redirect user
1657 : : * code (e.g. operators).
1658 : : *
1659 : : * It's not strictly necessary since we won't be scanning or writing to
1660 : : * any user table locally, but it's good to retain it here for added
1661 : : * precaution.
1662 : : */
821 1663 : 6 : SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1664 : :
828 1665 : 6 : dbname = CheckAndGetDbnameFromConninfo();
1666 : :
1667 : : /*
1668 : : * Connect to the database specified by the user in primary_conninfo. We
1669 : : * need a database connection for walrcv_exec to work which we use to
1670 : : * fetch slot information from the remote node. See comments atop
1671 : : * libpqrcv_exec.
1672 : : *
1673 : : * We do not specify a specific user here since the slot sync worker will
1674 : : * operate as a superuser. This is safe because the slot sync worker does
1675 : : * not interact with user tables, eliminating the risk of executing
1676 : : * arbitrary code within triggers.
1677 : : */
1678 : 6 : InitPostgres(dbname, InvalidOid, NULL, InvalidOid, 0, NULL);
1679 : :
1680 : 6 : SetProcessingMode(NormalProcessing);
1681 : :
1682 : 6 : initStringInfo(&app_name);
1683 [ + - ]: 6 : if (cluster_name[0])
1684 : 6 : appendStringInfo(&app_name, "%s_%s", cluster_name, "slotsync worker");
1685 : : else
780 drowley@postgresql.o 1686 :UBC 0 : appendStringInfoString(&app_name, "slotsync worker");
1687 : :
1688 : : /*
1689 : : * Establish the connection to the primary server for slot
1690 : : * synchronization.
1691 : : */
828 akapila@postgresql.o 1692 :CBC 6 : wrconn = walrcv_connect(PrimaryConnInfo, false, false, false,
1693 : : app_name.data, &err);
1694 : :
1695 [ - + ]: 6 : if (!wrconn)
828 akapila@postgresql.o 1696 [ # # ]:UBC 0 : ereport(ERROR,
1697 : : errcode(ERRCODE_CONNECTION_FAILURE),
1698 : : errmsg("synchronization worker \"%s\" could not connect to the primary server: %s",
1699 : : app_name.data, err));
1700 : :
269 akapila@postgresql.o 1701 :CBC 6 : pfree(app_name.data);
1702 : :
1703 : : /*
1704 : : * Register the disconnection callback.
1705 : : *
1706 : : * XXX: This can be combined with previous cleanup registration of
1707 : : * slotsync_worker_onexit() but that will need the connection to be made
1708 : : * global and we want to avoid introducing global for this purpose.
1709 : : */
765 1710 : 6 : before_shmem_exit(slotsync_worker_disconnect, PointerGetDatum(wrconn));
1711 : :
1712 : : /*
1713 : : * Using the specified primary server connection, check that we are not a
1714 : : * cascading standby and slot configured in 'primary_slot_name' exists on
1715 : : * the primary server.
1716 : : */
828 1717 : 6 : validate_remote_info(wrconn);
1718 : :
1719 : : /* Main loop to synchronize slots */
1720 : : for (;;)
1721 : 17 : {
1722 : 23 : bool some_slot_updated = false;
166 akapila@postgresql.o 1723 :GNC 23 : bool started_tx = false;
1724 : : List *remote_slots;
1725 : :
52 fujii@postgresql.org 1726 [ + + ]:CBC 23 : CHECK_FOR_INTERRUPTS();
1727 : :
1728 [ + + ]: 20 : if (ConfigReloadPending)
1729 : 1 : slotsync_reread_config();
1730 : :
1731 : : /*
1732 : : * The syscache access in fetch_remote_slots() needs a transaction
1733 : : * env.
1734 : : */
166 akapila@postgresql.o 1735 [ + - ]:GNC 19 : if (!IsTransactionState())
1736 : : {
1737 : 19 : StartTransactionCommand();
1738 : 19 : started_tx = true;
1739 : : }
1740 : :
1741 : 19 : remote_slots = fetch_remote_slots(wrconn, NIL);
1742 : 17 : some_slot_updated = synchronize_slots(wrconn, remote_slots, NULL);
1743 : 17 : list_free_deep(remote_slots);
1744 : :
1745 [ + - ]: 17 : if (started_tx)
1746 : 17 : CommitTransactionCommand();
1747 : :
828 akapila@postgresql.o 1748 :CBC 17 : wait_for_slot_activity(some_slot_updated);
1749 : : }
1750 : :
1751 : : /*
1752 : : * The slot sync worker can't get here because it will only stop when it
1753 : : * receives a stop request from the startup process, or when there is an
1754 : : * error.
1755 : : */
1756 : : Assert(false);
1757 : : }
1758 : :
1759 : : /*
1760 : : * Update the inactive_since property for synced slots.
1761 : : *
1762 : : * Note that this function is currently called when we shutdown the slot
1763 : : * sync machinery.
1764 : : */
1765 : : static void
785 1766 : 1017 : update_synced_slots_inactive_since(void)
1767 : : {
1768 : 1017 : TimestampTz now = 0;
1769 : :
1770 : : /*
1771 : : * We need to update inactive_since only when we are promoting standby to
1772 : : * correctly interpret the inactive_since if the standby gets promoted
1773 : : * without a restart. We don't want the slots to appear inactive for a
1774 : : * long time after promotion if they haven't been synchronized recently.
1775 : : * Whoever acquires the slot, i.e., makes the slot active, will reset it.
1776 : : */
1777 [ + + ]: 1017 : if (!StandbyMode)
1778 : 962 : return;
1779 : :
1780 : : /* The slot sync worker or the SQL function mustn't be running by now */
765 1781 [ + - - + ]: 55 : Assert((SlotSyncCtx->pid == InvalidPid) && !SlotSyncCtx->syncing);
1782 : :
785 1783 : 55 : LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
1784 : :
53 alvherre@kurilemu.de 1785 [ + + ]:GNC 868 : for (int i = 0; i < max_replication_slots + max_repack_replication_slots; i++)
1786 : : {
785 akapila@postgresql.o 1787 :CBC 813 : ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i];
1788 : :
1789 : : /* Check if it is a synchronized slot */
1790 [ + + + + ]: 813 : if (s->in_use && s->data.synced)
1791 : : {
1792 [ - + ]: 3 : Assert(SlotIsLogical(s));
1793 : :
1794 : : /* The slot must not be acquired by any process */
109 heikki.linnakangas@i 1795 [ - + ]:GNC 3 : Assert(s->active_proc == INVALID_PROC_NUMBER);
1796 : :
1797 : : /* Use the same inactive_since time for all the slots. */
785 akapila@postgresql.o 1798 [ + + ]:CBC 3 : if (now == 0)
1799 : 2 : now = GetCurrentTimestamp();
1800 : :
479 1801 : 3 : ReplicationSlotSetInactiveSince(s, now, true);
1802 : : }
1803 : : }
1804 : :
785 1805 : 55 : LWLockRelease(ReplicationSlotControlLock);
1806 : : }
1807 : :
1808 : : /*
1809 : : * Shut down slot synchronization.
1810 : : *
1811 : : * This function sets stopSignaled=true and wakes up the slot sync process
1812 : : * (either worker or backend running the SQL function pg_sync_replication_slots())
1813 : : * so that worker can exit or the SQL function pg_sync_replication_slots() can
1814 : : * finish. It also waits till the slot sync worker has exited or
1815 : : * pg_sync_replication_slots() has finished.
1816 : : */
1817 : : void
828 1818 : 1017 : ShutDownSlotSync(void)
1819 : : {
1820 : : pid_t sync_process_pid;
1821 : :
1822 [ - + ]: 1017 : SpinLockAcquire(&SlotSyncCtx->mutex);
1823 : :
1824 : 1017 : SlotSyncCtx->stopSignaled = true;
1825 : :
1826 : : /*
1827 : : * Return if neither the slot sync worker is running nor the function
1828 : : * pg_sync_replication_slots() is executing.
1829 : : */
765 1830 [ + + ]: 1017 : if (!SlotSyncCtx->syncing)
1831 : : {
828 1832 : 1016 : SpinLockRelease(&SlotSyncCtx->mutex);
785 1833 : 1016 : update_synced_slots_inactive_since();
828 1834 : 1016 : return;
1835 : : }
1836 : :
170 1837 : 1 : sync_process_pid = SlotSyncCtx->pid;
1838 : :
828 1839 : 1 : SpinLockRelease(&SlotSyncCtx->mutex);
1840 : :
1841 : : /*
1842 : : * Signal process doing slotsync, if any, asking it to stop.
1843 : : */
170 1844 [ + - ]: 1 : if (sync_process_pid != InvalidPid)
52 fujii@postgresql.org 1845 : 1 : SendProcSignal(sync_process_pid, PROCSIG_SLOTSYNC_MESSAGE,
1846 : : INVALID_PROC_NUMBER);
1847 : :
1848 : : /* Wait for slot sync to end */
1849 : : for (;;)
828 akapila@postgresql.o 1850 :LBC (1) : {
1851 : : int rc;
1852 : :
1853 : : /* Wait a bit, we don't expect to have to wait long */
828 akapila@postgresql.o 1854 :CBC 1 : rc = WaitLatch(MyLatch,
1855 : : WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1856 : : 10L, WAIT_EVENT_REPLICATION_SLOTSYNC_SHUTDOWN);
1857 : :
1858 [ - + ]: 1 : if (rc & WL_LATCH_SET)
1859 : : {
828 akapila@postgresql.o 1860 :LBC (1) : ResetLatch(MyLatch);
1861 [ # # ]: (1) : CHECK_FOR_INTERRUPTS();
1862 : : }
1863 : :
828 akapila@postgresql.o 1864 [ - + ]:CBC 1 : SpinLockAcquire(&SlotSyncCtx->mutex);
1865 : :
1866 : : /* Ensure that no process is syncing the slots. */
765 1867 [ + - ]: 1 : if (!SlotSyncCtx->syncing)
828 1868 : 1 : break;
1869 : :
828 akapila@postgresql.o 1870 :LBC (1) : SpinLockRelease(&SlotSyncCtx->mutex);
1871 : : }
1872 : :
828 akapila@postgresql.o 1873 :CBC 1 : SpinLockRelease(&SlotSyncCtx->mutex);
1874 : :
785 1875 : 1 : update_synced_slots_inactive_since();
1876 : : }
1877 : :
1878 : : /*
1879 : : * SlotSyncWorkerCanRestart
1880 : : *
1881 : : * Return true, indicating worker is allowed to restart, if enough time has
1882 : : * passed since it was last launched to reach SLOTSYNC_RESTART_INTERVAL_SEC.
1883 : : * Otherwise return false.
1884 : : *
1885 : : * This is a safety valve to protect against continuous respawn attempts if the
1886 : : * worker is dying immediately at launch. Note that since we will retry to
1887 : : * launch the worker from the postmaster main loop, we will get another
1888 : : * chance later.
1889 : : */
1890 : : bool
828 1891 : 11 : SlotSyncWorkerCanRestart(void)
1892 : : {
1893 : 11 : time_t curtime = time(NULL);
1894 : :
1895 : : /*
1896 : : * If first time through, or time somehow went backwards, always update
1897 : : * last_start_time to match the current clock and allow worker start.
1898 : : * Otherwise allow it only once enough time has elapsed.
1899 : : */
220 tgl@sss.pgh.pa.us 1900 [ + + ]:GNC 11 : if (SlotSyncCtx->last_start_time == 0 ||
1901 [ + - ]: 5 : curtime < SlotSyncCtx->last_start_time ||
1902 [ - + ]: 5 : curtime - SlotSyncCtx->last_start_time >= SLOTSYNC_RESTART_INTERVAL_SEC)
1903 : : {
1904 : 6 : SlotSyncCtx->last_start_time = curtime;
1905 : 6 : return true;
1906 : : }
1907 : 5 : return false;
1908 : : }
1909 : :
1910 : : /*
1911 : : * Is current process syncing replication slots?
1912 : : *
1913 : : * Could be either backend executing SQL function or slot sync worker.
1914 : : */
1915 : : bool
836 akapila@postgresql.o 1916 :CBC 72 : IsSyncingReplicationSlots(void)
1917 : : {
1918 : 72 : return syncing_slots;
1919 : : }
1920 : :
1921 : : /*
1922 : : * Register shared memory space needed for slot synchronization.
1923 : : */
1924 : : static void
54 heikki.linnakangas@i 1925 :GNC 1251 : SlotSyncShmemRequest(void *arg)
1926 : : {
1927 : 1251 : ShmemRequestStruct(.name = "Slot Sync Data",
1928 : : .size = sizeof(SlotSyncCtxStruct),
1929 : : .ptr = (void **) &SlotSyncCtx,
1930 : : );
836 akapila@postgresql.o 1931 :GIC 1251 : }
1932 : :
1933 : : /*
1934 : : * Initialize shared memory for slot synchronization.
1935 : : */
1936 : : static void
54 heikki.linnakangas@i 1937 :GNC 1248 : SlotSyncShmemInit(void *arg)
1938 : : {
1939 : 1248 : memset(SlotSyncCtx, 0, sizeof(SlotSyncCtxStruct));
1940 : 1248 : SlotSyncCtx->pid = InvalidPid;
1941 : 1248 : SpinLockInit(&SlotSyncCtx->mutex);
836 akapila@postgresql.o 1942 :CBC 1248 : }
1943 : :
1944 : : /*
1945 : : * Error cleanup callback for slot sync SQL function.
1946 : : */
1947 : : static void
1948 : 1 : slotsync_failure_callback(int code, Datum arg)
1949 : : {
1950 : 1 : WalReceiverConn *wrconn = (WalReceiverConn *) DatumGetPointer(arg);
1951 : :
1952 : : /*
1953 : : * We need to do slots cleanup here just like WalSndErrorCleanup() does.
1954 : : *
1955 : : * The startup process during promotion invokes ShutDownSlotSync() which
1956 : : * waits for slot sync to finish and it does that by checking the
1957 : : * 'syncing' flag. Thus the SQL function must be done with slots' release
1958 : : * and cleanup to avoid any dangling temporary slots or active slots
1959 : : * before it marks itself as finished syncing.
1960 : : */
1961 : :
1962 : : /* Make sure active replication slots are released */
765 1963 [ - + ]: 1 : if (MyReplicationSlot != NULL)
765 akapila@postgresql.o 1964 :UBC 0 : ReplicationSlotRelease();
1965 : :
1966 : : /* Also cleanup the synced temporary slots. */
765 akapila@postgresql.o 1967 :CBC 1 : ReplicationSlotCleanup(true);
1968 : :
1969 : : /*
1970 : : * The set syncing_slots indicates that the process errored out without
1971 : : * resetting the flag. So, we need to clean up shared memory and reset the
1972 : : * flag here.
1973 : : */
1974 [ + - ]: 1 : if (syncing_slots)
1975 : 1 : reset_syncing_flag();
1976 : :
836 1977 : 1 : walrcv_disconnect(wrconn);
1978 : 1 : }
1979 : :
1980 : : /*
1981 : : * Helper function to extract slot names from a list of remote slots
1982 : : */
1983 : : static List *
166 akapila@postgresql.o 1984 :GNC 1 : extract_slot_names(List *remote_slots)
1985 : : {
1986 : 1 : List *slot_names = NIL;
1987 : :
1988 [ + - + + : 3 : foreach_ptr(RemoteSlot, remote_slot, remote_slots)
+ + ]
1989 : : {
1990 : : char *slot_name;
1991 : :
1992 : 1 : slot_name = pstrdup(remote_slot->name);
1993 : 1 : slot_names = lappend(slot_names, slot_name);
1994 : : }
1995 : :
1996 : 1 : return slot_names;
1997 : : }
1998 : :
1999 : : /*
2000 : : * Synchronize the failover enabled replication slots using the specified
2001 : : * primary server connection.
2002 : : *
2003 : : * Repeatedly fetches and updates replication slot information from the
2004 : : * primary until all slots are at least "sync ready".
2005 : : *
2006 : : * Exits early if promotion is triggered or certain critical
2007 : : * configuration parameters have changed.
2008 : : */
2009 : : void
836 akapila@postgresql.o 2010 :CBC 9 : SyncReplicationSlots(WalReceiverConn *wrconn)
2011 : : {
2012 [ + + ]: 9 : PG_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
2013 : : {
166 akapila@postgresql.o 2014 :GNC 9 : List *remote_slots = NIL;
2015 : 9 : List *slot_names = NIL; /* List of slot names to track */
2016 : : MemoryContext sync_retry_ctx;
2017 : :
170 akapila@postgresql.o 2018 :CBC 9 : check_and_set_sync_info(MyProcPid);
2019 : :
836 2020 : 9 : validate_remote_info(wrconn);
2021 : :
2022 : : /*
2023 : : * Setup and use a per-sync-cycle memory context, which is reset every
2024 : : * time we loop below. This avoids having to retail freeing the memory
2025 : : * used in each sync cycle.
2026 : : */
4 akapila@postgresql.o 2027 :GNC 8 : sync_retry_ctx = AllocSetContextCreate(CurrentMemoryContext,
2028 : : "slot sync retry context",
2029 : : ALLOCSET_DEFAULT_SIZES);
2030 : :
2031 : : /* Retry until all the slots are sync-ready */
2032 : : for (;;)
166 2033 : 2 : {
2034 : 10 : bool slot_persistence_pending = false;
2035 : 10 : bool some_slot_updated = false;
2036 : : MemoryContext oldctx;
2037 : :
2038 : : /* Check for interrupts and config changes */
52 fujii@postgresql.org 2039 [ - + ]: 10 : CHECK_FOR_INTERRUPTS();
2040 : :
2041 [ - + ]: 10 : if (ConfigReloadPending)
52 fujii@postgresql.org 2042 :UNC 0 : slotsync_reread_config();
2043 : :
2044 : : /* We must be in a valid transaction state */
166 akapila@postgresql.o 2045 [ - + ]:GNC 10 : Assert(IsTransactionState());
2046 : :
4 2047 : 10 : MemoryContextReset(sync_retry_ctx);
2048 : 10 : oldctx = MemoryContextSwitchTo(sync_retry_ctx);
2049 : :
2050 : : /*
2051 : : * Fetch remote slot info for the given slot_names. If slot_names
2052 : : * is NIL, fetch all failover-enabled slots. Note that we reuse
2053 : : * slot_names from the first iteration; re-fetching all failover
2054 : : * slots each time could cause an endless loop. Instead of
2055 : : * reprocessing only the pending slots in each iteration, it's
2056 : : * better to process all the slots received in the first
2057 : : * iteration. This ensures that by the time we're done, all slots
2058 : : * reflect the latest values.
2059 : : */
166 2060 : 10 : remote_slots = fetch_remote_slots(wrconn, slot_names);
2061 : :
2062 : : /* Attempt to synchronize slots */
2063 : 10 : some_slot_updated = synchronize_slots(wrconn, remote_slots,
2064 : : &slot_persistence_pending);
2065 : :
2066 : : /*
2067 : : * slot_names must survive later sync_retry_ctx resets, so copy it
2068 : : * in the outer context.
2069 : : */
4 2070 : 10 : MemoryContextSwitchTo(oldctx);
2071 : :
2072 : : /*
2073 : : * If slot_persistence_pending is true, extract slot names for
2074 : : * future iterations (only needed if we haven't done it yet)
2075 : : */
166 2076 [ + + + + ]: 10 : if (slot_names == NIL && slot_persistence_pending)
2077 : 1 : slot_names = extract_slot_names(remote_slots);
2078 : :
2079 : : /* Done if all slots are persisted i.e are sync-ready */
2080 [ + + ]: 10 : if (!slot_persistence_pending)
2081 : 8 : break;
2082 : :
2083 : : /* wait before retrying again */
2084 : 2 : wait_for_slot_activity(some_slot_updated);
2085 : : }
2086 : :
4 2087 : 8 : MemoryContextDelete(sync_retry_ctx);
2088 : :
166 2089 [ + + ]: 8 : if (slot_names)
2090 : 1 : list_free_deep(slot_names);
2091 : :
2092 : : /* Cleanup the synced temporary slots */
765 akapila@postgresql.o 2093 :CBC 8 : ReplicationSlotCleanup(true);
2094 : :
2095 : : /* We are done with sync, so reset sync flag */
2096 : 8 : reset_syncing_flag();
2097 : : }
836 2098 [ - + ]: 9 : PG_END_ENSURE_ERROR_CLEANUP(slotsync_failure_callback, PointerGetDatum(wrconn));
2099 : 8 : }
|