Fix watchdog communication race condition.
authorTatsuo Ishii <ishii@sraoss.co.jp>
Fri, 21 May 2021 23:00:05 +0000 (08:00 +0900)
committerTatsuo Ishii <ishii@sraoss.co.jp>
Sat, 22 May 2021 04:00:46 +0000 (13:00 +0900)
Watchdog sends information from the watchdog process to the Pgpool-II
main process using SIGUSR1. To pass detailed messages it uses shared
memory area. First it sets a message to the shared memory area then
sends SIGUSR1 to the main process. The main process received the
signal and the signal handler sets a global variable so that
sigusr1_interrupt_processor() processes it. However it is possible
that while sigusr1_interrupt_processor() is running new signal
arrives. In this case the new signal is caught but the global variable
is set to 0 after sigusr1_interrupt_processor() returns. This means
that the new message is not processed until new signal arrives, which
could cause significant delay before the message was processed.

To fix the problem, sigusr1_interrupt_processor() is repeatedly called
until there's no pending message.

Discussion: https://www.pgpool.net/pipermail/pgpool-hackers/2021-May/003901.html

src/main/pgpool_main.c

index 040a47f4c1ae1e6e63c5b60849c6ad414382c8ed..feb375332a37dfc8bf9526d79174ff625c183f5a 100644 (file)
@@ -94,8 +94,10 @@ typedef struct User1SignalSlot
                } \
                if (sigusr1_request) \
                { \
-                       sigusr1_interrupt_processor(); \
-                       sigusr1_request = 0; \
+                       do {\
+                               sigusr1_request = 0; \
+                               sigusr1_interrupt_processor(); \
+                       } while (sigusr1_request == 1); \
                } \
                if (sigchld_request) \
                { \
@@ -330,8 +332,10 @@ int PgpoolMain(bool discard_status, bool clear_memcache_oidmaps)
 
                if (sigusr1_request)
                {
-                       sigusr1_interrupt_processor();
-                       sigusr1_request = 0;
+                       do {
+                               sigusr1_request = 0;
+                               sigusr1_interrupt_processor();
+                       } while (sigusr1_request == 1);
                }
        }
 
@@ -695,6 +699,9 @@ void register_watchdog_state_change_interrupt(void)
 }
 static void signal_user1_to_parent_with_reason(User1SignalReason reason)
 {
+       ereport(LOG,
+                       (errmsg("signal_user1_to_parent_with_reason(%d)", reason)));
+
        user1SignalSlot->signalFlags[reason] = true;
        pool_signal_parent(SIGUSR1);
 }
@@ -1550,13 +1557,13 @@ static RETSIGTYPE sigusr1_handler(int sig)
 
 static void sigusr1_interrupt_processor(void)
 {
-       ereport(DEBUG1,
+       ereport(LOG,
                        (errmsg("Pgpool-II parent process received SIGUSR1")));
 
        if (user1SignalSlot->signalFlags[SIG_WATCHDOG_STATE_CHANGED])
        {
-               ereport(DEBUG1,
-                               (errmsg("Pgpool-II parent process received SIGUSR1 from watchdog")));
+               ereport(LOG,
+                               (errmsg("Pgpool-II parent process received watchdog state change signal from watchdog")));
 
                user1SignalSlot->signalFlags[SIG_WATCHDOG_STATE_CHANGED] = false;
                if (get_watchdog_local_node_state() == WD_STANDBY)