Skip to content

Commit 218d40f

Browse files
skkyalCommitfest Bot
authored andcommitted
Add slotsync_skip_reason to pg_replication_slots
This patch introduces a new column slotsync_skip_reason to the pg_replication_slots view. It indicates the reason for the last slot synchronization skip.
1 parent 42473b3 commit 218d40f

File tree

12 files changed

+144
-16
lines changed

12 files changed

+144
-16
lines changed

doc/src/sgml/monitoring.sgml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1665,7 +1665,7 @@ description | Waiting for a newly initialized WAL file to reach durable storage
16651665
</para>
16661666
<para>
16671667
Number of times the slot synchronization is skipped. Slot
1668-
synchronization occur only on standby servers and thus this column has
1668+
synchronization occurs only on standby servers and thus this column has
16691669
no meaning on the primary server.
16701670
</para>
16711671
</entry>
@@ -1677,7 +1677,7 @@ description | Waiting for a newly initialized WAL file to reach durable storage
16771677
</para>
16781678
<para>
16791679
Time at which last slot synchronization was skipped. Slot
1680-
synchronization occur only on standby servers and thus this column has
1680+
synchronization occurs only on standby servers and thus this column has
16811681
no meaning on the primary server.
16821682
</para>
16831683
</entry>

doc/src/sgml/system-views.sgml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3102,6 +3102,48 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
31023102
</para></entry>
31033103
</row>
31043104

3105+
<row>
3106+
<entry role="catalog_table_entry"><para role="column_definition">
3107+
<structfield>slotsync_skip_reason</structfield><type>text</type>
3108+
</para>
3109+
<para>
3110+
The reason for the last slot synchronization skip. Slot
3111+
synchronization occurs only on standby servers and thus this column has
3112+
no meaning on the primary server. It is relevant mainly for logical slots
3113+
on standby servers whose <structfield>synced</structfield> field is
3114+
<literal>true</literal>. It is <literal>NULL</literal> if slot
3115+
synchronization is successful.
3116+
Possible values are:
3117+
<itemizedlist spacing="compact">
3118+
<listitem>
3119+
<para>
3120+
<literal>wal_or_rows_removed</literal> means that the required WALs or
3121+
catalog rows have already been removed or are at risk of removal
3122+
from the standby.
3123+
</para>
3124+
</listitem>
3125+
<listitem>
3126+
<para>
3127+
<literal>wal_not_flushed</literal> means that the standby had not
3128+
flushed the WAL corresponding to the position reserved on the failover
3129+
slot.
3130+
</para>
3131+
</listitem>
3132+
<listitem>
3133+
<para>
3134+
<literal>no_consistent_snapshot</literal> means that the standby could
3135+
not build a consistent snapshot to decode WALs from
3136+
<structfield>restart_lsn</structfield>.
3137+
</para>
3138+
</listitem>
3139+
<listitem>
3140+
<para>
3141+
<literal>slot_invalidated</literal> means that the slot is invalidated.
3142+
</para>
3143+
</listitem>
3144+
</itemizedlist>
3145+
</para></entry>
3146+
</row>
31053147
</tbody>
31063148
</tgroup>
31073149
</table>

src/backend/catalog/system_views.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1060,7 +1060,8 @@ CREATE VIEW pg_replication_slots AS
10601060
L.conflicting,
10611061
L.invalidation_reason,
10621062
L.failover,
1063-
L.synced
1063+
L.synced,
1064+
L.slotsync_skip_reason
10641065
FROM pg_get_replication_slots() AS L
10651066
LEFT JOIN pg_database D ON (L.datoid = D.oid);
10661067

src/backend/replication/logical/slotsync.c

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,35 @@ typedef struct RemoteSlot
148148
static void slotsync_failure_callback(int code, Datum arg);
149149
static void update_synced_slots_inactive_since(void);
150150

151+
/*
152+
* Update slot sync skip stats for MyReplicationSlot. This function requires
153+
* the caller to have acquired the slot.
*
* If skip_reason is not SS_SKIP_NONE, the skip is reported through
* pgstat_report_replslotsync(). In all cases the slot's slotsync_skip_reason
* ends up equal to skip_reason, so passing SS_SKIP_NONE clears a previously
* recorded reason.
154+
*/
155+
static void
156+
update_slotsync_skip_stats(SlotSyncSkipReason skip_reason)
157+
{
158+
ReplicationSlot *slot;
159+
160+
Assert(MyReplicationSlot);
161+
162+
slot = MyReplicationSlot;
163+
164+
/*
165+
* Update the slot sync related stats in pg_stat_replication_slots when a
166+
* slot sync is skipped.
167+
*/
168+
if (skip_reason != SS_SKIP_NONE)
169+
pgstat_report_replslotsync(slot);
170+
171+
/*
* Update the slot sync skip reason. The current value is compared without
* the spinlock and the spinlock is taken only when the value changes.
* NOTE(review): presumably safe because only the process that acquired the
* slot writes this field -- confirm.
*/
172+
if (slot->slotsync_skip_reason != skip_reason)
173+
{
174+
SpinLockAcquire(&slot->mutex);
175+
slot->slotsync_skip_reason = skip_reason;
176+
SpinLockRelease(&slot->mutex);
177+
}
178+
}
179+
151180
/*
152181
* If necessary, update the local synced slot's metadata based on the data
153182
* from the remote slot.
@@ -170,6 +199,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
170199
ReplicationSlot *slot = MyReplicationSlot;
171200
bool updated_xmin_or_lsn = false;
172201
bool updated_config = false;
202+
SlotSyncSkipReason skip_reason = SS_SKIP_NONE;
173203

174204
Assert(slot->data.invalidated == RS_INVAL_NONE);
175205

@@ -188,7 +218,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
188218
slot->data.catalog_xmin))
189219
{
190220
/* Update slot sync skip stats */
191-
pgstat_report_replslotsync(slot);
221+
update_slotsync_skip_stats(SS_SKIP_WAL_OR_ROWS_REMOVED);
192222

193223
/*
194224
* This can happen in following situations:
@@ -286,12 +316,15 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
286316
* persisted. See update_and_persist_local_synced_slot().
287317
*/
288318
if (found_consistent_snapshot && !(*found_consistent_snapshot))
289-
pgstat_report_replslotsync(slot);
319+
skip_reason = SS_SKIP_NO_CONSISTENT_SNAPSHOT;
290320
}
291321

292322
updated_xmin_or_lsn = true;
293323
}
294324

325+
/* Update slot sync skip stats */
326+
update_slotsync_skip_stats(skip_reason);
327+
295328
if (remote_dbid != slot->data.database ||
296329
remote_slot->two_phase != slot->data.two_phase ||
297330
remote_slot->failover != slot->data.failover ||
@@ -696,7 +729,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid)
696729
/* Skip the sync of an invalidated slot */
697730
if (slot->data.invalidated != RS_INVAL_NONE)
698731
{
699-
pgstat_report_replslotsync(slot);
732+
update_slotsync_skip_stats(SS_SKIP_INVALID);
700733

701734
ReplicationSlotRelease();
702735
return slot_updated;
@@ -711,7 +744,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid)
711744
*/
712745
if (remote_slot->confirmed_lsn > latestFlushPtr)
713746
{
714-
pgstat_report_replslotsync(slot);
747+
update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED);
715748

716749
/*
717750
* Can get here only if GUC 'synchronized_standby_slots' on the
@@ -812,7 +845,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid)
812845
*/
813846
if (remote_slot->confirmed_lsn > latestFlushPtr)
814847
{
815-
pgstat_report_replslotsync(slot);
848+
update_slotsync_skip_stats(SS_SKIP_WAL_NOT_FLUSHED);
816849

817850
/*
818851
* Can get here only if GUC 'synchronized_standby_slots' on the

src/backend/replication/slot.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,7 @@ ReplicationSlotCreate(const char *name, bool db_specific,
491491
slot->last_saved_confirmed_flush = InvalidXLogRecPtr;
492492
slot->last_saved_restart_lsn = InvalidXLogRecPtr;
493493
slot->inactive_since = 0;
494+
slot->slotsync_skip_reason = SS_SKIP_NONE;
494495

495496
/*
496497
* Create the slot on disk. We haven't actually marked the slot allocated

src/backend/replication/slotfuncs.c

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,17 @@
2424
#include "utils/guc.h"
2525
#include "utils/pg_lsn.h"
2626

27+
/*
28+
* Map SlotSyncSkipReason enum values to human-readable names.
*
* The table is indexed by the SlotSyncSkipReason value. For SS_SKIP_NONE,
* pg_get_replication_slots() returns NULL instead of looking up this table.
29+
*/
30+
static const char *SlotSyncSkipReasonNames[] = {
31+
[SS_SKIP_NONE] = "none",
32+
[SS_SKIP_WAL_NOT_FLUSHED] = "wal_not_flushed",
33+
[SS_SKIP_WAL_OR_ROWS_REMOVED] = "wal_or_rows_removed",
34+
[SS_SKIP_NO_CONSISTENT_SNAPSHOT] = "no_consistent_snapshot",
35+
[SS_SKIP_INVALID] = "slot_invalidated"
36+
};
37+
2738
/*
2839
* Helper function for creating a new physical replication slot with
2940
* given arguments. Note that this function doesn't release the created
@@ -235,7 +246,7 @@ pg_drop_replication_slot(PG_FUNCTION_ARGS)
235246
Datum
236247
pg_get_replication_slots(PG_FUNCTION_ARGS)
237248
{
238-
#define PG_GET_REPLICATION_SLOTS_COLS 20
249+
#define PG_GET_REPLICATION_SLOTS_COLS 21
239250
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
240251
XLogRecPtr currlsn;
241252
int slotno;
@@ -443,6 +454,11 @@ pg_get_replication_slots(PG_FUNCTION_ARGS)
443454

444455
values[i++] = BoolGetDatum(slot_contents.data.synced);
445456

457+
if (slot_contents.slotsync_skip_reason == SS_SKIP_NONE)
458+
nulls[i++] = true;
459+
else
460+
values[i++] = CStringGetTextDatum(SlotSyncSkipReasonNames[slot_contents.slotsync_skip_reason]);
461+
446462
Assert(i == PG_GET_REPLICATION_SLOTS_COLS);
447463

448464
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,

src/backend/utils/activity/pgstat_replslot.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,8 @@ pgstat_report_replslotsync(ReplicationSlot *slot)
115115
PgStatShared_ReplSlot *shstatent;
116116
PgStat_StatReplSlotEntry *statent;
117117

118-
/* Slot sync stats are valid only for logical slots on standby. */
119-
Assert(SlotIsLogical(slot));
118+
/* Slot sync stats are valid only for synced logical slots on standby. */
119+
Assert(slot->data.synced);
120120
Assert(RecoveryInProgress());
121121

122122
entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid,

src/include/catalog/pg_proc.dat

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11519,9 +11519,9 @@
1151911519
proname => 'pg_get_replication_slots', prorows => '10', proisstrict => 'f',
1152011520
proretset => 't', provolatile => 's', prorettype => 'record',
1152111521
proargtypes => '',
11522-
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,pg_lsn,timestamptz,bool,text,bool,bool}',
11523-
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
11524-
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,two_phase_at,inactive_since,conflicting,invalidation_reason,failover,synced}',
11522+
proallargtypes => '{name,name,text,oid,bool,bool,int4,xid,xid,pg_lsn,pg_lsn,text,int8,bool,pg_lsn,timestamptz,bool,text,bool,bool,text}',
11523+
proargmodes => '{o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o,o}',
11524+
proargnames => '{slot_name,plugin,slot_type,datoid,temporary,active,active_pid,xmin,catalog_xmin,restart_lsn,confirmed_flush_lsn,wal_status,safe_wal_size,two_phase,two_phase_at,inactive_since,conflicting,invalidation_reason,failover,synced,slotsync_skip_reason}',
1152511525
prosrc => 'pg_get_replication_slots' },
1152611526
{ oid => '3786', descr => 'set up a logical replication slot',
1152711527
proname => 'pg_create_logical_replication_slot', provolatile => 'v',

src/include/replication/slot.h

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,23 @@ typedef enum ReplicationSlotInvalidationCause
7171
/* Maximum number of invalidation causes */
7272
#define RS_INVAL_MAX_CAUSES 4
7373

74+
/*
75+
* When the slot sync worker is running or pg_sync_replication_slots() is run,
76+
* the sync of a slot can be skipped. This enum lists the reasons for such a
77+
* skip.
*
* These values are used as indexes into SlotSyncSkipReasonNames[] in
* slotfuncs.c, so keep that table in sync when adding a reason.
78+
*/
79+
typedef enum SlotSyncSkipReason
80+
{
81+
SS_SKIP_NONE, /* No skip */
82+
SS_SKIP_WAL_NOT_FLUSHED, /* Standby did not flush the WAL corresponding
83+
* to confirmed flush of remote slot */
84+
SS_SKIP_WAL_OR_ROWS_REMOVED, /* Remote slot is behind; required WAL or
85+
* rows may be removed or at risk */
86+
SS_SKIP_NO_CONSISTENT_SNAPSHOT, /* Standby could not build a consistent
87+
* snapshot */
88+
SS_SKIP_INVALID /* Local slot is invalidated */
89+
} SlotSyncSkipReason;
90+
7491
/*
7592
* On-Disk data of a replication slot, preserved across restarts.
7693
*/
@@ -249,6 +266,16 @@ typedef struct ReplicationSlot
249266
*/
250267
XLogRecPtr last_saved_restart_lsn;
251268

269+
/*
270+
* The reason for last slot sync skip.
271+
*
272+
* A slotsync skip typically occurs only for temporary slots. For
273+
* persistent slots it is extremely rare (e.g., cases like
274+
* SS_SKIP_WAL_NOT_FLUSHED or SS_SKIP_WAL_OR_ROWS_REMOVED). Since
275+
* temporary slots are dropped after server restart, there is no value
276+
* in persisting the slotsync_skip_reason.
277+
*/
278+
SlotSyncSkipReason slotsync_skip_reason;
252279
} ReplicationSlot;
253280

254281
#define SlotIsPhysical(slot) ((slot)->data.database == InvalidOid)

src/test/recovery/t/040_standby_failover_slots_sync.pl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,12 @@
10491049
$standby2->wait_for_log(
10501050
qr/could not synchronize replication slot \"lsub1_slot\"/, $log_offset);
10511051

1052+
# Confirm that the slotsync skip reason is updated
1053+
$result = $standby2->safe_psql('postgres',
1054+
"SELECT slotsync_skip_reason FROM pg_replication_slots WHERE slot_name = 'lsub1_slot'"
1055+
);
1056+
is($result, 'wal_or_rows_removed', "check slot sync skip reason");
1057+
10521058
# Confirm that the slotsync skip statistics are updated
10531059
$result = $standby2->safe_psql('postgres',
10541060
"SELECT slotsync_skip_count > 0 FROM pg_stat_replication_slots WHERE slot_name = 'lsub1_slot'"

0 commit comments

Comments
 (0)