diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index 9a9216dc7b1b..632b12abbf84 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -1,8 +1,9 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
- from pg_settings
- where name = 'shared_buffers')
-from pg_buffercache;
+select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers
+ from pg_settings
+ where name = 'shared_buffers'
+\gset
+select count(*) = :nbuffers from pg_buffercache;
?column?
----------
t
@@ -23,6 +24,20 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
t
(1 row)
+-- Test that the buffer lookup table function works and its count is <= shared_buffers
+select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries();
+ ?column?
+----------
+ t
+(1 row)
+
+-- Check that the pg_buffercache_lookup_table view works and its count is <= shared_buffers
+select count(*) <= :nbuffers from pg_buffercache_lookup_table;
+ ?column?
+----------
+ t
+(1 row)
+
-- Check that the functions / views can't be accessed by default. To avoid
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
@@ -34,6 +49,10 @@ SELECT * FROM pg_buffercache_summary();
ERROR: permission denied for function pg_buffercache_summary
SELECT * FROM pg_buffercache_usage_counts();
ERROR: permission denied for function pg_buffercache_usage_counts
+SELECT * FROM pg_buffercache_lookup_table_entries();
+ERROR: permission denied for function pg_buffercache_lookup_table_entries
+SELECT * FROM pg_buffercache_lookup_table;
+ERROR: permission denied for view pg_buffercache_lookup_table
RESET role;
-- Check that pg_monitor is allowed to query view / function
SET ROLE pg_monitor;
@@ -55,6 +74,21 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
t
(1 row)
+RESET role;
+-- Check that pg_read_all_stats is allowed to query the buffer lookup table
+SET ROLE pg_read_all_stats;
+SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries();
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT count(*) >= 0 FROM pg_buffercache_lookup_table;
+ ?column?
+----------
+ t
+(1 row)
+
RESET role;
------
---- Test pg_buffercache_evict* functions
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
index 458f054a6917..9bf58567878d 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -44,3 +44,27 @@ CREATE FUNCTION pg_buffercache_evict_all(
OUT buffers_skipped int4)
AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all'
LANGUAGE C PARALLEL SAFE VOLATILE;
+
+-- Add the buffer lookup table function
+CREATE FUNCTION pg_buffercache_lookup_table_entries(
+ OUT tablespace oid,
+ OUT database oid,
+ OUT relfilenode oid,
+ OUT forknum int2,
+ OUT blocknum int8,
+ OUT bufferid int4)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME', 'pg_buffercache_lookup_table_entries'
+LANGUAGE C PARALLEL SAFE VOLATILE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_lookup_table AS
+ SELECT * FROM pg_buffercache_lookup_table_entries();
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_lookup_table_entries() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_lookup_table FROM PUBLIC;
+
+-- Grant access to monitoring role.
+GRANT EXECUTE ON FUNCTION pg_buffercache_lookup_table_entries() TO pg_read_all_stats;
+GRANT SELECT ON pg_buffercache_lookup_table TO pg_read_all_stats;
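+
+-- Example usage (illustrative; assumes block 0 of pg_class is currently cached):
+--
+--   SELECT bufferid FROM pg_buffercache_lookup_table
+--    WHERE relfilenode = pg_relation_filenode('pg_class')
+--      AND forknum = 0 AND blocknum = 0;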
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index c29b784dfa1a..45efc6a314bf 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -16,6 +16,7 @@
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"
+#include "utils/tuplestore.h"
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
@@ -100,6 +101,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation);
PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
+PG_FUNCTION_INFO_V1(pg_buffercache_lookup_table_entries);
/* Only need to touch memory once per backend process lifetime */
@@ -116,6 +118,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
TupleDesc tupledesc;
TupleDesc expected_tupledesc;
HeapTuple tuple;
+ int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers);
if (SRF_IS_FIRSTCALL())
{
@@ -172,10 +175,10 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
/* Allocate NBuffers worth of BufferCachePagesRec records. */
fctx->record = (BufferCachePagesRec *)
MemoryContextAllocHuge(CurrentMemoryContext,
- sizeof(BufferCachePagesRec) * NBuffers);
+ sizeof(BufferCachePagesRec) * currentNBuffers);
/* Set max calls and remember the user function context. */
- funcctx->max_calls = NBuffers;
+ funcctx->max_calls = currentNBuffers;
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
@@ -189,13 +192,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
* snapshot across all buffers, but we do grab the buffer header
* locks, so the information of each buffer is self-consistent.
*/
- for (i = 0; i < NBuffers; i++)
+ for (i = 0; i < currentNBuffers; i++)
{
BufferDesc *bufHdr;
uint32 buf_state;
CHECK_FOR_INTERRUPTS();
+ /*
+		 * TODO: We should just scan the entire buffer descriptor array
+		 * instead of relying on the current buffer pool size. But that can
+		 * happen only if we set up the descriptor array large enough at
+		 * server startup time.
+ */
+ if (currentNBuffers != pg_atomic_read_u32(&ShmemCtrl->currentNBuffers))
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("number of shared buffers changed during scan of buffer cache")));
+
bufHdr = GetBufferDescriptor(i);
/* Lock each buffer header before inspecting. */
buf_state = LockBufHdr(bufHdr);
@@ -776,3 +790,19 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(result);
}
+
+/*
+ * Return lookup table content as a set of records.
+ */
+Datum
+pg_buffercache_lookup_table_entries(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ /* Fill the tuplestore */
+ BufTableGetContents(rsinfo->setResult, rsinfo->setDesc);
+
+ return (Datum) 0;
+}
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 47cca1907c74..11fe85ceb3bb 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -1,9 +1,10 @@
CREATE EXTENSION pg_buffercache;
-select count(*) = (select setting::bigint
- from pg_settings
- where name = 'shared_buffers')
-from pg_buffercache;
+select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers
+ from pg_settings
+ where name = 'shared_buffers'
+\gset
+select count(*) = :nbuffers from pg_buffercache;
select buffers_used + buffers_unused > 0,
buffers_dirty <= buffers_used,
@@ -12,6 +13,12 @@ from pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
+-- Test that the buffer lookup table function works and its count is <= shared_buffers
+select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries();
+
+-- Check that the pg_buffercache_lookup_table view works and its count is <= shared_buffers
+select count(*) <= :nbuffers from pg_buffercache_lookup_table;
+
-- Check that the functions / views can't be accessed by default. To avoid
-- having to create a dedicated user, use the pg_database_owner pseudo-role.
SET ROLE pg_database_owner;
@@ -19,6 +26,8 @@ SELECT * FROM pg_buffercache;
SELECT * FROM pg_buffercache_pages() AS p (wrong int);
SELECT * FROM pg_buffercache_summary();
SELECT * FROM pg_buffercache_usage_counts();
+SELECT * FROM pg_buffercache_lookup_table_entries();
+SELECT * FROM pg_buffercache_lookup_table;
RESET role;
-- Check that pg_monitor is allowed to query view / function
@@ -28,6 +37,12 @@ SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
RESET role;
+-- Check that pg_read_all_stats is allowed to query the buffer lookup table
+SET ROLE pg_read_all_stats;
+SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries();
+SELECT count(*) >= 0 FROM pg_buffercache_lookup_table;
+RESET role;
+
------
---- Test pg_buffercache_evict* functions
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 023b3f03ba93..d007055eed79 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1724,7 +1724,6 @@ include_dir 'conf.d'
that is BLCKSZ bytes, typically 8kB.
(Non-default values of BLCKSZ change the minimum
value.)
- This parameter can only be set at server start.
@@ -1747,6 +1746,49 @@ include_dir 'conf.d'
appropriate, so as to leave adequate space for the operating system.
+
+        The shared memory consumed by the buffer pool is allocated and
+        initialized according to the value of this parameter at the time of
+        starting the server. A new value can be loaded while the server is
+        running using SIGHUP, but the buffer pool is not
+        resized immediately. Use
+        pg_resize_shared_buffers() to dynamically resize
+        the shared buffer pool (see for details).
+        SHOW shared_buffers shows the current number of
+        shared buffers and the pending value, if any. Note that other
+        parameters that derive their defaults from this parameter's value are
+        not adjusted when it changes; they may still require a server restart
+        to pick up the new value.
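+
+        For example, while a new value is pending:
+
+postgres=# SHOW shared_buffers;
+     shared_buffers
+-------------------------
+ 128MB (pending: 256MB)
+(1 row)
+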
+
+
+
+
+
+ max_shared_buffers (integer)
+
+ max_shared_buffers configuration parameter
+
+
+
+
+ Sets the upper limit for the shared_buffers value.
+ The default value is 0,
+ which means no explicit limit is set and max_shared_buffers
+ will be automatically set to the value of shared_buffers
+ at server startup.
+ If this value is specified without units, it is taken as blocks,
+ that is BLCKSZ bytes, typically 8kB.
+ This parameter can only be set at server start.
+
+
+
+        This parameter determines the amount of memory address space to
+        reserve in each backend for expanding the buffer pool in the future.
+        While the memory for the buffer pool itself is allocated on demand as
+        it is resized, the memory required to hold the buffer manager
+        metadata is allocated statically at server start, accounting for the
+        largest buffer pool size allowed by this parameter.
+
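+
+        For example, an illustrative postgresql.conf
+        snippet that starts the server with a modest buffer pool while
+        reserving enough address space to grow it later without a restart:
+
+shared_buffers = 1GB
+max_shared_buffers = 8GB
+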
diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml
index 1b465bc8ba71..0dc89b07c765 100644
--- a/doc/src/sgml/func/func-admin.sgml
+++ b/doc/src/sgml/func/func-admin.sgml
@@ -99,6 +99,63 @@
off
+
+
+
+
+ pg_resize_shared_buffers
+
+ pg_resize_shared_buffers ()
+ boolean
+
+
+ Dynamically resizes the shared buffer pool to match the current
+ value of the shared_buffers parameter. This
+ function implements a coordinated resize process that ensures all
+ backend processes acknowledge the change before completing the
+ operation. The resize happens in multiple phases to maintain
+ data consistency and system stability. Returns true
+ if the resize was successful, or raises an error if the operation
+ fails. This function can only be called by superusers.
+
+
+ To resize shared buffers, first update the shared_buffers
+ setting and reload the configuration, then verify the new value is loaded
+ before calling this function. For example:
+
+postgres=# ALTER SYSTEM SET shared_buffers = '256MB';
+ALTER SYSTEM
+postgres=# SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+postgres=# SHOW shared_buffers;
+ shared_buffers
+-------------------------
+ 128MB (pending: 256MB)
+(1 row)
+
+postgres=# SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+postgres=# SHOW shared_buffers;
+ shared_buffers
+----------------
+ 256MB
+(1 row)
+
+ The SHOW shared_buffers step is important to verify
+ that the configuration reload was successful and the new value is
+ available to the current session before attempting the resize. The
+ output shows both the current and pending values when a change is waiting
+ to be applied.
+
+
diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml
index 0e623e7fb867..7cb1e5e17f81 100644
--- a/doc/src/sgml/system-views.sgml
+++ b/doc/src/sgml/system-views.sgml
@@ -71,6 +71,11 @@
backend memory contexts
+
+ pg_buffer_lookup_table
+ shared buffer lookup table
+
+
 pg_config
 compile-time configuration parameters
@@ -901,6 +906,90 @@ AND c1.path[c2.level] = c2.path[c2.level];
+
+ pg_buffer_lookup_table
+
+ pg_buffer_lookup_table
+
+
+  The pg_buffer_lookup_table view exposes the current
+  contents of the shared buffer lookup table. Each row represents an entry in
+  the lookup table, mapping a relation page to the ID of the buffer in which
+  it is cached. The shared buffer lookup table is locked for a short time
+  while it is read, to ensure consistency; this may affect performance if the
+  view is queried very frequently.
+
+
+ pg_buffer_lookup_table View
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+ tablespace oid
+
+
+ OID of the tablespace containing the relation
+
+
+
+
+ database oid
+
+
+ OID of the database containing the relation (zero for shared relations)
+
+
+
+
+ relfilenode oid
+
+
+ relfilenode identifying the relation
+
+
+
+
+ forknum int2
+
+
+ Fork number within the relation (see )
+
+
+
+
+ blocknum int8
+
+
+ Block number within the relation
+
+
+
+
+ bufferid int4
+
+
+ ID of the buffer caching the page
+
+
+
+
+
+
+ Access to this view is restricted to members of the
+ pg_read_all_stats role by default.
+
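+
+  For example, an illustrative query counting how many pages of each
+  database are currently cached:
+
+SELECT database, count(*) AS cached_pages
+  FROM pg_buffer_lookup_table
+ GROUP BY database;
+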
+
+
pg_config
@@ -4144,6 +4233,15 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+
+
+ segment text
+
+
+ The name of the shared memory segment containing the allocation.
+
+
+
 off int8
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 77676d6d0359..73df59098866 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -232,7 +232,7 @@ SimpleLruAutotuneBuffers(int divisor, int max)
{
return Min(max - (max % SLRU_BANK_SIZE),
Max(SLRU_BANK_SIZE,
- NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE));
+ NBuffersPending / divisor - (NBuffersPending / divisor) % SLRU_BANK_SIZE));
}
/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 22d0a2e8c3a6..f4363e0035d9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4676,7 +4676,7 @@ XLOGChooseNumBuffers(void)
{
int xbuffers;
- xbuffers = NBuffers / 32;
+ xbuffers = NBuffersPending / 32;
if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
xbuffers = (wal_segment_size / XLOG_BLCKSZ);
if (xbuffers < 8)
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index fc8638c1b61b..226944e45882 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -335,6 +335,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only)
InitializeFastPathLocks();
+ InitializeMaxNBuffers();
+
CreateSharedMemoryAndSemaphores();
/*
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 95ad29a64b98..6a0180f39be4 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -668,6 +668,13 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
+CREATE VIEW pg_shmem_segments AS
+ SELECT * FROM pg_get_shmem_segments();
+
+REVOKE ALL ON pg_shmem_segments FROM PUBLIC;
+GRANT SELECT ON pg_shmem_segments TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_segments() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_segments() TO pg_read_all_stats;
CREATE VIEW pg_shmem_allocations_numa AS
SELECT * FROM pg_get_shmem_allocations_numa();
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 197926d44f6b..68de301441bb 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -30,13 +30,19 @@
#include "miscadmin.h"
#include "port/pg_bitutils.h"
#include "portability/mem.h"
+#include "storage/bufmgr.h"
#include "storage/dsm.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
#include "utils/guc.h"
#include "utils/guc_hooks.h"
#include "utils/pidfile.h"
+#include "utils/wait_event.h"
/*
@@ -90,12 +96,90 @@ typedef enum
SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */
} IpcMemoryState;
-
+/*
+ * TODO: These should be moved into ShmemSegment, now that there can be multiple
+ * shared memory segments. But there's Windows-specific code which will need
+ * adjustment, so leaving it here.
+ */
unsigned long UsedShmemSegID = 0;
void *UsedShmemSegAddr = NULL;
-static Size AnonymousShmemSize;
-static void *AnonymousShmem = NULL;
+volatile bool delay_shmem_resize = false;
+
+/*
+ * Anonymous mapping layout we use looks like this:
+ *
+ * 00400000-00c2a000 r-xp /bin/postgres
+ * ...
+ * 3f526000-3f590000 rw-p [heap]
+ * 7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted)
+ * 7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted)
+ * 7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive
+ * 7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34
+ * ...
+ *
+ * We need to place shared memory mappings in such a way that there are gaps
+ * between them in the address space. Those gaps have to be large enough to
+ * resize a mapping up to a certain size, without counting towards the total
+ * memory consumption.
+ *
+ * To achieve this, for each shared memory segment we first create an
+ * anonymous file of the specified size using memfd_create, which will
+ * accommodate the actual shared memory mapping content. It is represented by
+ * the first /memfd:main above, with rw permissions. Then we create a mapping
+ * for this file using mmap, with a size much larger than required and with
+ * the MAP_NORESERVE flag, which prevents the reserved space from being
+ * counted against memory limits. The mapping serves as an address space
+ * reservation into which the shared memory segment can be extended, and is
+ * represented by the second /memfd:main above.
+ *
+ * The reserved space for the buffer manager related segments is calculated
+ * based on MaxNBuffers.
+ */
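+
+/*
+ * A minimal sketch of the technique described above, with assumed example
+ * names and sizes (not the actual server code):
+ *
+ *    int   fd = memfd_create("main", 0);
+ *    char *base;
+ *
+ *    ftruncate(fd, initial_size);                 -- the file-backed part
+ *    base = mmap(NULL, reserved_size, PROT_READ | PROT_WRITE,
+ *                MAP_SHARED | MAP_NORESERVE, fd, 0);
+ *    ...
+ *    ftruncate(fd, larger_size);                  -- grow without remapping
+ *
+ * Only the file-backed prefix of the mapping consumes memory; the rest of
+ * the reservation costs nothing until the file is grown.
+ */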
+
+/*
+ * Flag telling that we have decided to use huge pages.
+ *
+ * XXX: It's possible to use GetConfigOption("huge_pages_status", false, false)
+ * instead, but it feels like overkill.
+ */
+static bool huge_pages_on = false;
+
+/*
+ * Currently broadcasted value of NBuffers in shared memory.
+ *
+ * Most of the time this value is going to be equal to NBuffers. But if the
+ * postmaster is resizing shared memory while a new backend is being created,
+ * the new backend may inherit the old NBuffers value and miss the resize
+ * signal, because the ProcSignal infrastructure was not initialized yet.
+ * Consider this situation:
+ *
+ * Postmaster ------> New Backend
+ * | |
+ * | Launch
+ * | |
+ * | Inherit NBuffers
+ * | |
+ * Resize NBuffers |
+ * | |
+ * Emit Barrier |
+ * | Init ProcSignal
+ * | |
+ * Finish resize |
+ * | |
+ * New NBuffers Old NBuffers
+ *
+ * In this case the backend is not yet ready to receive a signal from
+ * EmitProcSignalBarrier, so the signal will be ignored. The same happens if
+ * ProcSignal is initialized even later, after the resizing has finished.
+ *
+ * To address the resulting inconsistency, the postmaster broadcasts the
+ * current NBuffers value via shared memory. Every new backend has to verify
+ * this value before it accesses the buffer pool: if it differs from its own
+ * value, a shared memory resize has happened and the backend has to
+ * synchronize with the rest of the backends first.
+ */
+ShmemControl *ShmemCtrl = NULL;
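+
+/*
+ * A minimal sketch (assumed, not the actual synchronization code) of the
+ * check described above, which a new backend performs before using the
+ * buffer pool:
+ *
+ *    if (NBuffers != pg_atomic_read_u32(&ShmemCtrl->currentNBuffers))
+ *        -- adopt the broadcasted NBuffers and remap before proceeding
+ */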
static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -104,6 +188,25 @@ static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId,
void *attachAt,
PGShmemHeader **addr);
+const char *
+MappingName(int shmem_segment)
+{
+ switch (shmem_segment)
+ {
+ case MAIN_SHMEM_SEGMENT:
+ return "main";
+ case BUFFERS_SHMEM_SEGMENT:
+ return "buffers";
+ case BUFFER_DESCRIPTORS_SHMEM_SEGMENT:
+ return "descriptors";
+ case BUFFER_IOCV_SHMEM_SEGMENT:
+ return "iocv";
+ case CHECKPOINT_BUFFERS_SHMEM_SEGMENT:
+ return "checkpoint";
+ default:
+ return "unknown";
+ }
+}
/*
* InternalIpcMemoryCreate(memKey, size)
@@ -470,19 +573,20 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
* hugepage sizes, we might want to think about more invasive strategies,
* such as increasing shared_buffers to absorb the extra space.
*
- * Returns the (real, assumed or config provided) page size into
- * *hugepagesize, and the hugepage-related mmap flags to use into
- * *mmap_flags if requested by the caller. If huge pages are not supported,
- * *hugepagesize and *mmap_flags are set to 0.
+ * Returns the (real, assumed or config provided) page size into *hugepagesize,
+ * the hugepage-related mmap and memfd flags to use into *mmap_flags and
+ * *memfd_flags if requested by the caller. If huge pages are not supported,
+ * *hugepagesize, *mmap_flags and *memfd_flags are set to 0.
*/
void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
{
#ifdef MAP_HUGETLB
Size default_hugepagesize = 0;
Size hugepagesize_local = 0;
int mmap_flags_local = 0;
+ int memfd_flags_local = 0;
/*
* System-dependent code to find out the default huge page size.
@@ -541,6 +645,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
}
mmap_flags_local = MAP_HUGETLB;
+ memfd_flags_local = MFD_HUGETLB;
/*
* On recent enough Linux, also include the explicit page size, if
@@ -551,7 +656,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
{
int shift = pg_ceil_log2_64(hugepagesize_local);
		mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ }
+#endif
+
+#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT)
+ if (hugepagesize_local != default_hugepagesize)
+ {
+ int shift = pg_ceil_log2_64(hugepagesize_local);
+
+		memfd_flags_local |= (shift & MFD_HUGE_MASK) << MFD_HUGE_SHIFT;
}
#endif
@@ -560,6 +674,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
*mmap_flags = mmap_flags_local;
if (hugepagesize)
*hugepagesize = hugepagesize_local;
+ if (memfd_flags)
+ *memfd_flags = memfd_flags_local;
#else
@@ -567,6 +683,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
*hugepagesize = 0;
if (mmap_flags)
*mmap_flags = 0;
+ if (memfd_flags)
+ *memfd_flags = 0;
#endif /* MAP_HUGETLB */
}
@@ -588,83 +706,242 @@ check_huge_page_size(int *newval, void **extra, GucSource source)
return true;
}
+/*
+ * Wrapper around posix_fallocate() to allocate memory for a given shared memory
+ * segment.
+ *
+ * Performs retry on EINTR, and raises error upon failure.
+ */
+static void
+shmem_fallocate(int fd, const char *mapping_name, Size size, int elevel)
+{
+#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
+ int ret;
+
+ /*
+ * If there is not enough memory, trying to access a hole in address space
+ * will cause SIGBUS. If supported, avoid that by allocating memory upfront.
+ *
+ * We still use a traditional EINTR retry loop to handle SIGCONT.
+ * posix_fallocate() doesn't restart automatically, and we don't want this to
+ * fail if you attach a debugger.
+ */
+ do
+ {
+ ret = posix_fallocate(fd, 0, size);
+ } while (ret == EINTR);
+
+ if (ret != 0)
+ {
+ ereport(elevel,
+ (errmsg("segment[%s]: could not allocate space for anonymous file: %s",
+ mapping_name, strerror(ret)),
+ (ret == ENOMEM) ?
+ errhint("This error usually means that PostgreSQL's request "
+ "for a shared memory segment exceeded available memory, "
+ "swap space, or huge pages. To reduce the request size "
+ "(currently %zu bytes), reduce PostgreSQL's shared "
+ "memory usage, perhaps by reducing \"shared_buffers\" or "
+ "\"max_connections\".",
+ size) : 0));
+ }
+#endif /* HAVE_POSIX_FALLOCATE && __linux__ */
+}
+
+/*
+ * Round up the required amount of memory and the amount of required reserved
+ * address space to the nearest huge page size.
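+ * For example, with 2MB huge pages a request of 129MB is rounded up to
+ * 130MB, the next multiple of the huge page size.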
+ */
+static inline void
+round_off_mapping_sizes_for_hugepages(MemoryMappingSizes *mapping, int hugepagesize)
+{
+ if (hugepagesize == 0)
+ return;
+
+ if (mapping->shmem_req_size % hugepagesize != 0)
+ mapping->shmem_req_size += hugepagesize -
+ (mapping->shmem_req_size % hugepagesize);
+
+	if (mapping->shmem_reserved % hugepagesize != 0)
+		mapping->shmem_reserved += hugepagesize -
+			(mapping->shmem_reserved % hugepagesize);
+}
+
/*
* Creates an anonymous mmap()ed shared memory segment.
*
- * Pass the requested size in *size. This function will modify *size to the
- * actual size of the allocation, if it ends up allocating a segment that is
- * larger than requested.
+ * This function will modify mapping size to the actual size of the allocation,
+ * if it ends up allocating a segment that is larger than requested. If needed,
+ * it also rounds up the mapping reserved size to be a multiple of huge page
+ * size.
+ *
+ * Note that we do not fallback from huge pages to regular pages in this
+ * function, this decision was already made in ReserveAnonymousMemory and we
+ * stick to it.
+ *
+ * TODO: Update the prologue to be consistent with the code.
*/
-static void *
-CreateAnonymousSegment(Size *size)
+static void
+CreateAnonymousSegment(MemoryMappingSizes *mapping, int segment_id)
{
- Size allocsize = *size;
void *ptr = MAP_FAILED;
- int mmap_errno = 0;
+ int save_errno = 0;
+ int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0;
+ ShmemSegment *segment = &Segments[segment_id];
#ifndef MAP_HUGETLB
- /* PGSharedMemoryCreate should have dealt with this case */
- Assert(huge_pages != HUGE_PAGES_ON);
+ /* PrepareHugePages should have dealt with this case */
+ Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on);
#else
- if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+ if (huge_pages_on)
{
- /*
- * Round up the request size to a suitable large value.
- */
Size hugepagesize;
- int mmap_flags;
- GetHugePageSize(&hugepagesize, &mmap_flags);
+ /* Make sure nothing is messed up */
+ Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
- if (allocsize % hugepagesize != 0)
- allocsize += hugepagesize - (allocsize % hugepagesize);
+ /* Round up the request size to a suitable large value */
+ GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags);
+ round_off_mapping_sizes_for_hugepages(mapping, hugepagesize);
- ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS | mmap_flags, -1, 0);
- mmap_errno = errno;
- if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
- elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
- allocsize);
+		/* Verify that the new size is within the reserved boundaries */
+ Assert(mapping->shmem_reserved >= mapping->shmem_req_size);
+
+ mmap_flags = PG_MMAP_FLAGS | mmap_flags;
}
#endif
/*
- * Report whether huge pages are in use. This needs to be tracked before
- * the second mmap() call if attempting to use huge pages failed
- * previously.
+ * Prepare an anonymous file backing the segment. Its size will be
+ * specified later via ftruncate.
+ *
+ * The file behaves like a regular file, but lives in memory. Once all
+ * references to the file are dropped, it is automatically released.
+ * Anonymous memory is used for all backing pages of the file, thus it has
+ * the same semantics as anonymous memory allocations using mmap with the
+ * MAP_ANONYMOUS flag.
*/
- SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
- PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+ segment->segment_fd = memfd_create(MappingName(segment_id), memfd_flags);
+ if (segment->segment_fd == -1)
+ ereport(FATAL,
+ (errmsg("segment[%s]: could not create anonymous shared memory file: %m",
+ MappingName(segment_id))));
- if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON)
- {
- /*
- * Use the original size, not the rounded-up value, when falling back
- * to non-huge pages.
- */
- allocsize = *size;
- ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
- PG_MMAP_FLAGS, -1, 0);
- mmap_errno = errno;
- }
+	elog(DEBUG1, "segment[%s]: reserving %zu bytes, initial size %zu",
+		 MappingName(segment_id), mapping->shmem_reserved, mapping->shmem_req_size);
+ /*
+	 * Reserve the maximum required address space for future expansion of
+	 * this memory segment. MAP_NORESERVE ensures that no memory is allocated
+	 * upfront. The whole address space is set up for read/write access, so
+	 * that memory allocated within it can be read and written even after the
+	 * segment is resized.
+ */
+ ptr = mmap(NULL, mapping->shmem_reserved, PROT_READ | PROT_WRITE,
+ mmap_flags | MAP_NORESERVE, segment->segment_fd, 0);
if (ptr == MAP_FAILED)
+ ereport(FATAL,
+ (errmsg("segment[%s]: could not map anonymous shared memory: %m",
+ MappingName(segment_id))));
+
+ /*
+ * Resize the backing file to the required size. On platforms where it is
+ * supported, we also allocate the required memory upfront. On other
+	 * platforms, memory up to the size of the file will be allocated on demand.
+ */
+	if (ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1)
{
- errno = mmap_errno;
+ save_errno = errno;
+
+ close(segment->segment_fd);
+
+ errno = save_errno;
ereport(FATAL,
- (errmsg("could not map anonymous shared memory: %m"),
- (mmap_errno == ENOMEM) ?
+ (errmsg("segment[%s]: could not truncate anonymous file to size %zu: %m",
+ MappingName(segment_id), mapping->shmem_req_size),
+ (save_errno == ENOMEM) ?
errhint("This error usually means that PostgreSQL's request "
"for a shared memory segment exceeded available memory, "
"swap space, or huge pages. To reduce the request size "
"(currently %zu bytes), reduce PostgreSQL's shared "
"memory usage, perhaps by reducing \"shared_buffers\" or "
"\"max_connections\".",
- allocsize) : 0));
+ mapping->shmem_req_size) : 0));
+ }
+ shmem_fallocate(segment->segment_fd, MappingName(segment_id), mapping->shmem_req_size, FATAL);
+
+ segment->shmem = ptr;
+ segment->shmem_size = mapping->shmem_req_size;
+ segment->shmem_reserved = mapping->shmem_reserved;
+}
+
+/*
+ * PrepareHugePages
+ *
+ * Figure out if there are enough huge pages to allocate all shared memory
+ * segments, and report that information via huge_pages_status and
+ * huge_pages_on. It needs to be called before creating shared memory segments.
+ *
+ * It is necessary to maintain the same semantics (simple on/off) for
+ * huge_pages_status, even if there are multiple shared memory segments: all
+ * segments either use huge pages or none do; there is no mix of segments
+ * with different page sizes. Such a mix might actually be beneficial, in
+ * particular because only some segments may require a large amount of
+ * memory, but for now we go with the simple solution.
+ */
+void
+PrepareHugePages(void)
+{
+ void *ptr = MAP_FAILED;
+ MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
+
+ CalculateShmemSize(mapping_sizes);
+
+ /* Complain if hugepages demanded but we can't possibly support them */
+#if !defined(MAP_HUGETLB)
+ if (huge_pages == HUGE_PAGES_ON)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("huge pages not supported on this platform")));
+#else
+ if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+ {
+ Size hugepagesize, total_size = 0;
+ int mmap_flags;
+
+ GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
+
+ /*
+ * Figure out how much memory is needed for all segments, keeping in
+		 * mind that for every segment this value will be rounded up to the
+ * huge page size. The resulting value will be used to probe memory and
+ * decide whether we will allocate huge pages or not.
+ */
+		for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+ {
+ Size segment_size = mapping_sizes[segment].shmem_req_size;
+
+ if (segment_size % hugepagesize != 0)
+ segment_size += hugepagesize - (segment_size % hugepagesize);
+
+ total_size += segment_size;
+ }
+
+ /* Map total amount of memory to test its availability. */
+ elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB",
+ total_size);
+		ptr = mmap(NULL, total_size, PROT_NONE,
+				   PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0);
+
+		/* The mapping was only a probe, release it before creating segments. */
+		if (ptr != MAP_FAILED)
+			munmap(ptr, total_size);
}
+#endif
- *size = allocsize;
- return ptr;
+ /*
+ * Report whether huge pages are in use. This needs to be tracked before
+ * creating shared memory segments.
+ */
+ SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
+ PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+ huge_pages_on = ptr != MAP_FAILED;
}
/*
@@ -674,20 +951,133 @@ CreateAnonymousSegment(Size *size)
static void
AnonymousShmemDetach(int status, Datum arg)
{
- /* Release anonymous shared memory block, if any. */
- if (AnonymousShmem != NULL)
+	for (int i = 0; i < NUM_MEMORY_MAPPINGS; i++)
{
- if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
- elog(LOG, "munmap(%p, %zu) failed: %m",
- AnonymousShmem, AnonymousShmemSize);
- AnonymousShmem = NULL;
+ ShmemSegment *segment = &Segments[i];
+
+ /* Release anonymous shared memory block, if any. */
+ if (segment->shmem != NULL)
+ {
+ if (munmap(segment->shmem, segment->shmem_size) < 0)
+ elog(LOG, "munmap(%p, %zu) failed: %m",
+ segment->shmem, segment->shmem_size);
+ segment->shmem = NULL;
+ }
}
}
+/*
+ * Resize all shared memory segments based on the new shared_buffers value
+ * (saved in the ShmemCtrl area). The actual segment resizing is done via
+ * ftruncate, which will fail if there is not enough space to expand the
+ * anonymous file.
+ *
+ * TODO: Rename this to BufferShmemResize() or something. Only the buffer
+ * manager's memory should be resized in this function.
+ *
+ * TODO: This function changes the amount of shared memory used, so it should
+ * also update the show-only GUCs shared_memory_size and
+ * shared_memory_size_in_huge_pages in all backends. SetConfigOption() may be
+ * used for that, but it's not clear whether the is_reload parameter is safe
+ * to use while resizing is going on, nor at what stage it should be done.
+ */
+bool
+AnonymousShmemResize(void)
+{
+ int mmap_flags = PG_MMAP_FLAGS;
+	Size		hugepagesize = 0;
+ MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
+
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+	/*
+	 * TODO: This is a hack. NBuffersPending should never be written by
+	 * anything other than the GUC system. Find a way to pass the new
+	 * NBuffers value to BufferManagerShmemSize().
+	 */
+	NBuffersPending = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers);
+	elog(DEBUG1, "resize shmem from %d to %d", NBuffers, NBuffersPending);
+
+#ifndef MAP_HUGETLB
+ /* PrepareHugePages should have dealt with this case */
+ Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on);
+#else
+ if (huge_pages_on)
+ {
+ Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY);
+ GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
+ }
+#endif
+
+ /* Note that BufferManagerShmemSize() indirectly depends on NBuffersPending. */
+ BufferManagerShmemSize(mapping_sizes);
+
+	for (int i = 0; i < NUM_MEMORY_MAPPINGS; i++)
+ {
+ MemoryMappingSizes *mapping = &mapping_sizes[i];
+ ShmemSegment *segment = &Segments[i];
+ PGShmemHeader *shmem_hdr = segment->ShmemSegHdr;
+
+ /* Main shared memory segment is always static. Ignore it. */
+ if (i == MAIN_SHMEM_SEGMENT)
+ continue;
+
+ round_off_mapping_sizes(mapping);
+ round_off_mapping_sizes_for_hugepages(mapping, hugepagesize);
+
+ /*
+ * Size of the reserved address space should not change, since it depends
+ * upon MaxNBuffers, which can be changed only on restart.
+ */
+ Assert(segment->shmem_reserved == mapping->shmem_reserved);
+ elog(DEBUG1, "segment[%s]: requested size %zu, current size %zu, reserved %zu",
+ MappingName(i), mapping->shmem_req_size, segment->shmem_size,
+ segment->shmem_reserved);
+
+ if (segment->shmem == NULL)
+ continue;
+
+ if (segment->shmem_size == mapping->shmem_req_size)
+ continue;
+
+ /*
+ * We should have reserved enough address space for resizing. PANIC if
+ * that's not the case.
+ */
+ if (segment->shmem_reserved < mapping->shmem_req_size)
+ ereport(PANIC,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("not enough shared memory is reserved")));
+
+ elog(DEBUG1, "segment[%s]: resize from %zu to %zu at address %p",
+ MappingName(i), segment->shmem_size,
+ mapping->shmem_req_size, segment->shmem);
+
+ /*
+ * Resize the backing file to resize the allocated memory, and allocate
+ * more memory on supported platforms if required.
+ */
+		if (ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYSTEM_ERROR),
+ errmsg("could not truncate anonymous file for \"%s\": %m",
+ MappingName(i))));
+ if (mapping->shmem_req_size > segment->shmem_size)
+ shmem_fallocate(segment->segment_fd, MappingName(i), mapping->shmem_req_size, ERROR);
+
+ segment->shmem_size = mapping->shmem_req_size;
+ shmem_hdr->totalsize = segment->shmem_size;
+ segment->ShmemEnd = segment->shmem + segment->shmem_size;
+ }
+
+ return true;
+}
+
/*
* PGSharedMemoryCreate
*
- * Create a shared memory segment of the given size and initialize its
+ * Create a shared memory segment for the given mapping and initialize its
* standard header. Also, register an on_shmem_exit callback to release
* the storage.
*
@@ -697,7 +1087,7 @@ AnonymousShmemDetach(int status, Datum arg)
* postmaster or backend.
*/
PGShmemHeader *
-PGSharedMemoryCreate(Size size,
+PGSharedMemoryCreate(MemoryMappingSizes *mapping, int segment_id,
PGShmemHeader **shim)
{
IpcMemoryKey NextShmemSegID;
@@ -705,6 +1095,7 @@ PGSharedMemoryCreate(Size size,
PGShmemHeader *hdr;
struct stat statbuf;
Size sysvsize;
+ ShmemSegment *segment = &Segments[segment_id];
/*
* We use the data directory's ID info (inode and device numbers) to
@@ -717,14 +1108,6 @@ PGSharedMemoryCreate(Size size,
errmsg("could not stat data directory \"%s\": %m",
DataDir)));
- /* Complain if hugepages demanded but we can't possibly support them */
-#if !defined(MAP_HUGETLB)
- if (huge_pages == HUGE_PAGES_ON)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("huge pages not supported on this platform")));
-#endif
-
/* For now, we don't support huge pages in SysV memory */
if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP)
ereport(ERROR,
@@ -732,12 +1115,12 @@ PGSharedMemoryCreate(Size size,
errmsg("huge pages not supported with the current \"shared_memory_type\" setting")));
/* Room for a header? */
- Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
+ Assert(mapping->shmem_req_size > MAXALIGN(sizeof(PGShmemHeader)));
if (shared_memory_type == SHMEM_TYPE_MMAP)
{
- AnonymousShmem = CreateAnonymousSegment(&size);
- AnonymousShmemSize = size;
+ /* On success, mapping data will be modified. */
+ CreateAnonymousSegment(mapping, segment_id);
/* Register on-exit routine to unmap the anonymous segment */
on_shmem_exit(AnonymousShmemDetach, (Datum) 0);
@@ -747,7 +1130,7 @@ PGSharedMemoryCreate(Size size,
}
else
{
- sysvsize = size;
+ sysvsize = mapping->shmem_req_size;
/* huge pages are only available with mmap */
SetConfigOption("huge_pages_status", "off",
@@ -760,7 +1143,7 @@ PGSharedMemoryCreate(Size size,
* loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
* that, but prefer fixing it over coping here.)
*/
- NextShmemSegID = statbuf.st_ino;
+ NextShmemSegID = statbuf.st_ino + segment_id;
for (;;)
{
@@ -852,13 +1235,13 @@ PGSharedMemoryCreate(Size size,
/*
* Initialize space allocation status for segment.
*/
- hdr->totalsize = size;
+ hdr->totalsize = mapping->shmem_req_size;
hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
*shim = hdr;
/* Save info for possible future use */
- UsedShmemSegAddr = memAddress;
- UsedShmemSegID = (unsigned long) NextShmemSegID;
+ segment->seg_addr = memAddress;
+ segment->seg_id = (unsigned long) NextShmemSegID;
/*
* If AnonymousShmem is NULL here, then we're not using anonymous shared
@@ -866,10 +1249,10 @@ PGSharedMemoryCreate(Size size,
* block. Otherwise, the System V shared memory block is only a shim, and
* we must return a pointer to the real block.
*/
- if (AnonymousShmem == NULL)
+ if (segment->shmem == NULL)
return hdr;
- memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader));
- return (PGShmemHeader *) AnonymousShmem;
+ memcpy(segment->shmem, hdr, sizeof(PGShmemHeader));
+ return (PGShmemHeader *) segment->shmem;
}
#ifdef EXEC_BACKEND
@@ -969,23 +1352,47 @@ PGSharedMemoryNoReAttach(void)
void
PGSharedMemoryDetach(void)
{
- if (UsedShmemSegAddr != NULL)
+	for (int i = 0; i < NUM_MEMORY_MAPPINGS; i++)
{
- if ((shmdt(UsedShmemSegAddr) < 0)
+ ShmemSegment *segment = &Segments[i];
+
+ if (segment->seg_addr != NULL)
+ {
+ if ((shmdt(segment->seg_addr) < 0)
#if defined(EXEC_BACKEND) && defined(__CYGWIN__)
- /* Work-around for cygipc exec bug */
- && shmdt(NULL) < 0
+ /* Work-around for cygipc exec bug */
+ && shmdt(NULL) < 0
#endif
- )
- elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr);
- UsedShmemSegAddr = NULL;
+ )
+ elog(LOG, "shmdt(%p) failed: %m", segment->seg_addr);
+ segment->seg_addr = NULL;
+ }
+
+ if (segment->shmem != NULL)
+ {
+ if (munmap(segment->shmem, segment->shmem_size) < 0)
+ elog(LOG, "munmap(%p, %zu) failed: %m",
+ segment->shmem, segment->shmem_size);
+ segment->shmem = NULL;
+ }
}
+}
- if (AnonymousShmem != NULL)
+void
+ShmemControlInit(void)
+{
+ bool foundShmemCtrl;
+
+ ShmemCtrl = (ShmemControl *)
+ ShmemInitStruct("Shmem Control", sizeof(ShmemControl),
+ &foundShmemCtrl);
+
+ if (!foundShmemCtrl)
{
- if (munmap(AnonymousShmem, AnonymousShmemSize) < 0)
- elog(LOG, "munmap(%p, %zu) failed: %m",
- AnonymousShmem, AnonymousShmemSize);
- AnonymousShmem = NULL;
+ pg_atomic_init_u32(&ShmemCtrl->targetNBuffers, 0);
+ pg_atomic_init_u32(&ShmemCtrl->currentNBuffers, 0);
+ pg_atomic_init_flag(&ShmemCtrl->resize_in_progress);
+
+ ShmemCtrl->coordinator = 0;
}
}
diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c
index 5854ad1f54d3..e7365ff8060d 100644
--- a/src/backend/port/win32_sema.c
+++ b/src/backend/port/win32_sema.c
@@ -44,7 +44,7 @@ PGSemaphoreShmemSize(int maxSemas)
* process exits.
*/
void
-PGReserveSemaphores(int maxSemas)
+PGReserveSemaphores(int maxSemas, int shmem_segment)
{
mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE));
if (mySemSet == NULL)
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 4dee856d6bd6..5c0c32babaf1 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -204,7 +204,7 @@ EnableLockPagesPrivilege(int elevel)
* standard header.
*/
PGShmemHeader *
-PGSharedMemoryCreate(Size size,
+PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id,
PGShmemHeader **shim)
{
void *memAddress;
@@ -216,9 +216,10 @@ PGSharedMemoryCreate(Size size,
DWORD size_high;
DWORD size_low;
SIZE_T largePageSize = 0;
- Size orig_size = size;
+ Size size = mapping_sizes->shmem_req_size;
DWORD flProtect = PAGE_READWRITE;
DWORD desiredAccess;
+	ShmemSegment *segment = &Segments[segment_id];
ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE,
MEM_RESERVE, PAGE_NOACCESS);
@@ -304,7 +305,7 @@ PGSharedMemoryCreate(Size size,
* Use the original size, not the rounded-up value, when
* falling back to non-huge pages.
*/
- size = orig_size;
+ size = mapping_sizes->shmem_req_size;
flProtect = PAGE_READWRITE;
goto retry;
}
@@ -393,6 +394,11 @@ PGSharedMemoryCreate(Size size,
hdr->dsm_control = 0;
/* Save info for possible future use */
+ segment->shmem_size = size;
+ segment->seg_addr = memAddress;
+ segment->shmem = (Pointer) hdr;
+ segment->seg_id = (unsigned long) hmap2;
+
UsedShmemSegAddr = memAddress;
UsedShmemSegSize = size;
UsedShmemSegID = hmap2;
@@ -627,7 +633,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
* use GetLargePageMinimum() instead.
*/
void
-GetHugePageSize(Size *hugepagesize, int *mmap_flags)
+GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags)
{
if (hugepagesize)
*hugepagesize = 0;
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e84e8663e966..ef3f84a55f57 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -654,9 +654,12 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len)
static void
ProcessCheckpointerInterrupts(void)
{
- if (ProcSignalBarrierPending)
- ProcessProcSignalBarrier();
-
+	/*
+	 * Reloading the config can trigger further signals, complicating
+	 * interrupt processing -- so let it run first.
+	 *
+	 * XXX: Is there any need for a memory barrier after ProcessConfigFile?
+	 */
if (ConfigReloadPending)
{
ConfigReloadPending = false;
@@ -676,6 +679,9 @@ ProcessCheckpointerInterrupts(void)
UpdateSharedMemoryConfig();
}
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+
/* Perform logging of memory contexts of this process */
if (LogMemoryContextPending)
ProcessLogMemoryContextInterrupt();
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 7c064cf9fbb2..2095713d7c0e 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -110,11 +110,15 @@
#include "replication/slotsync.h"
#include "replication/walsender.h"
#include "storage/aio_subsys.h"
+#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/io_worker.h"
#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
#include "tcop/backend_startup.h"
#include "tcop/tcopprot.h"
#include "utils/datetime.h"
@@ -125,7 +129,6 @@
#ifdef EXEC_BACKEND
#include "common/file_utils.h"
-#include "storage/pg_shmem.h"
#endif
@@ -958,6 +961,11 @@ PostmasterMain(int argc, char *argv[])
*/
InitializeFastPathLocks();
+ /*
+ * Calculate MaxNBuffers for buffer pool resizing.
+ */
+ InitializeMaxNBuffers();
+
/*
* Give preloaded libraries a chance to request additional shared memory.
*/
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
index fd7c40dcb089..3bc9aee85deb 100644
--- a/src/backend/storage/buffer/Makefile
+++ b/src/backend/storage/buffer/Makefile
@@ -17,6 +17,7 @@ OBJS = \
buf_table.o \
bufmgr.o \
freelist.o \
- localbuf.o
+ localbuf.o \
+ buf_resize.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 6fd3a6bbac5e..4a354107185d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -17,6 +17,7 @@
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+#include "utils/guc.h"
BufferDescPadded *BufferDescriptors;
char *BufferBlocks;
@@ -61,8 +62,12 @@ CkptSortItem *CkptBufferIds;
/*
* Initialize shared buffer pool
*
- * This is called once during shared-memory initialization (either in the
- * postmaster, or in a standalone backend).
+ * This is called once during shared-memory initialization.
+ *
+ * TODO: Restore this function to its initial form. This function should see
+ * no change in the buffer resize patches, except maybe the use of
+ * NBuffersPending.
+ *
+ * No locks are taken in this function; it is the caller's responsibility to
+ * make sure only one backend can work with the new buffers.
*/
void
BufferManagerShmemInit(void)
@@ -71,25 +76,26 @@ BufferManagerShmemInit(void)
foundDescs,
foundIOCV,
foundBufCkpt;
+ int i;
/* Align descriptors to a cacheline boundary. */
BufferDescriptors = (BufferDescPadded *)
- ShmemInitStruct("Buffer Descriptors",
- NBuffers * sizeof(BufferDescPadded),
- &foundDescs);
+ ShmemInitStructInSegment("Buffer Descriptors",
+ NBuffersPending * sizeof(BufferDescPadded),
+ &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT);
/* Align buffer pool on IO page size boundary. */
BufferBlocks = (char *)
TYPEALIGN(PG_IO_ALIGN_SIZE,
- ShmemInitStruct("Buffer Blocks",
- NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
- &foundBufs));
+ ShmemInitStructInSegment("Buffer Blocks",
+ NBuffersPending * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ &foundBufs, BUFFERS_SHMEM_SEGMENT));
/* Align condition variables to cacheline boundary. */
BufferIOCVArray = (ConditionVariableMinimallyPadded *)
- ShmemInitStruct("Buffer IO Condition Variables",
- NBuffers * sizeof(ConditionVariableMinimallyPadded),
- &foundIOCV);
+ ShmemInitStructInSegment("Buffer IO Condition Variables",
+ NBuffersPending * sizeof(ConditionVariableMinimallyPadded),
+ &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT);
/*
* The array used to sort to-be-checkpointed buffer ids is located in
@@ -99,81 +105,290 @@ BufferManagerShmemInit(void)
* painful.
*/
CkptBufferIds = (CkptSortItem *)
- ShmemInitStruct("Checkpoint BufferIds",
- NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
+ ShmemInitStructInSegment("Checkpoint BufferIds",
+ NBuffersPending * sizeof(CkptSortItem), &foundBufCkpt,
+ CHECKPOINT_BUFFERS_SHMEM_SEGMENT);
if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
{
/* should find all of these, or none of them */
Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
- /* note: this path is only taken in EXEC_BACKEND case */
- }
- else
- {
- int i;
-
/*
- * Initialize all the buffer headers.
+ * note: this path is only taken in EXEC_BACKEND case when initializing
+ * shared memory.
*/
- for (i = 0; i < NBuffers; i++)
- {
- BufferDesc *buf = GetBufferDescriptor(i);
+ }
- ClearBufferTag(&buf->tag);
+ /*
+ * Initialize all the buffer headers.
+ */
+ for (i = 0; i < NBuffersPending; i++)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
- pg_atomic_init_u32(&buf->state, 0);
- buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
+ ClearBufferTag(&buf->tag);
- buf->buf_id = i;
+ pg_atomic_init_u32(&buf->state, 0);
+ buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
- pgaio_wref_clear(&buf->io_wref);
+ buf->buf_id = i;
- LWLockInitialize(BufferDescriptorGetContentLock(buf),
- LWTRANCHE_BUFFER_CONTENT);
+ pgaio_wref_clear(&buf->io_wref);
- ConditionVariableInit(BufferDescriptorGetIOCV(buf));
- }
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ LWTRANCHE_BUFFER_CONTENT);
+
+ ConditionVariableInit(BufferDescriptorGetIOCV(buf));
}
- /* Init other shared buffer-management stuff */
+ /*
+ * Init other shared buffer-management stuff.
+ */
StrategyInitialize(!foundDescs);
/* Initialize per-backend file flush context */
WritebackContextInit(&BackendWritebackContext,
&backend_flush_after);
+
+	/* Declare the size of the current buffer pool. */
+ NBuffers = NBuffersPending;
+ pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, NBuffers);
+ pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, NBuffers);
}
/*
* BufferManagerShmemSize
*
* compute the size of shared memory for the buffer pool including
- * data pages, buffer descriptors, hash tables, etc.
+ * data pages, buffer descriptors, hash tables, etc., broken down by the
+ * shared memory segment they are placed in. The main segment must not
+ * contain anything related to buffers; every other segment receives its
+ * part of the data.
+ *
+ * Also sets the shmem_reserved field for each segment based on MaxNBuffers.
*/
Size
-BufferManagerShmemSize(void)
+BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes)
{
- Size size = 0;
+	Size		size;
- /* size of buffer descriptors */
- size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
- /* to allow aligning buffer descriptors */
+ /* size of buffer descriptors, plus alignment padding */
+ size = add_size(0, mul_size(NBuffersPending, sizeof(BufferDescPadded)));
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+ mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size;
+ size = add_size(0, mul_size(MaxNBuffers, sizeof(BufferDescPadded)));
size = add_size(size, PG_CACHE_LINE_SIZE);
+ mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_reserved = size;
/* size of data pages, plus alignment padding */
- size = add_size(size, PG_IO_ALIGN_SIZE);
- size = add_size(size, mul_size(NBuffers, BLCKSZ));
+ size = add_size(0, PG_IO_ALIGN_SIZE);
+ size = add_size(size, mul_size(NBuffersPending, BLCKSZ));
+ mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size;
+ size = add_size(0, PG_IO_ALIGN_SIZE);
+ size = add_size(size, mul_size(MaxNBuffers, BLCKSZ));
+ mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_reserved = size;
- /* size of stuff controlled by freelist.c */
- size = add_size(size, StrategyShmemSize());
-
- /* size of I/O condition variables */
- size = add_size(size, mul_size(NBuffers,
+ /* size of I/O condition variables, plus alignment padding */
+ size = add_size(0, mul_size(NBuffersPending,
+ sizeof(ConditionVariableMinimallyPadded)));
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+ mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size;
+ size = add_size(0, mul_size(MaxNBuffers,
sizeof(ConditionVariableMinimallyPadded)));
- /* to allow aligning the above */
size = add_size(size, PG_CACHE_LINE_SIZE);
+ mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_reserved = size;
/* size of checkpoint sort array in bufmgr.c */
- size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+ mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffersPending, sizeof(CkptSortItem));
+ mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(MaxNBuffers, sizeof(CkptSortItem));
+
+ /* Allocations in the main memory segment, at the end. */
+
+ /* size of stuff controlled by freelist.c */
+ size = add_size(0, StrategyShmemSize());
return size;
}
+
+/*
+ * Reinitialize shared buffer manager structures when resizing the buffer pool.
+ *
+ * This function is called in the backend which coordinates buffer resizing
+ * operation.
+ *
+ * TODO: Avoid code duplication with BufferManagerShmemInit() and also assess
+ * which functionality in the latter is required in this function.
+ */
+void
+BufferManagerShmemResize(int currentNBuffers, int targetNBuffers)
+{
+ bool found;
+ int i;
+ void *tmpPtr;
+
+ tmpPtr = (BufferDescPadded *)
+ ShmemUpdateStructInSegment("Buffer Descriptors",
+ targetNBuffers * sizeof(BufferDescPadded),
+ &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT);
+ if (BufferDescriptors != tmpPtr || !found)
+ elog(FATAL, "resizing buffer descriptors failed: expected pointer %p, got %p, found=%d",
+ BufferDescriptors, tmpPtr, found);
+
+ tmpPtr = (ConditionVariableMinimallyPadded *)
+ ShmemUpdateStructInSegment("Buffer IO Condition Variables",
+ targetNBuffers * sizeof(ConditionVariableMinimallyPadded),
+ &found, BUFFER_IOCV_SHMEM_SEGMENT);
+ if (BufferIOCVArray != tmpPtr || !found)
+ elog(FATAL, "resizing buffer IO condition variables failed: expected pointer %p, got %p, found=%d",
+ BufferIOCVArray, tmpPtr, found);
+
+ tmpPtr = (CkptSortItem *)
+ ShmemUpdateStructInSegment("Checkpoint BufferIds",
+ targetNBuffers * sizeof(CkptSortItem), &found,
+ CHECKPOINT_BUFFERS_SHMEM_SEGMENT);
+ if (CkptBufferIds != tmpPtr || !found)
+ elog(FATAL, "resizing checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d",
+ CkptBufferIds, tmpPtr, found);
+
+ tmpPtr = (char *)
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ ShmemUpdateStructInSegment("Buffer Blocks",
+ targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ &found, BUFFERS_SHMEM_SEGMENT));
+ if (BufferBlocks != tmpPtr || !found)
+ elog(FATAL, "resizing buffer blocks failed: expected pointer %p, got %p, found=%d",
+ BufferBlocks, tmpPtr, found);
+
+ /*
+	 * Initialize the headers for the new buffers. If we are shrinking the
+	 * buffer pool, currentNBuffers >= targetNBuffers, so this loop doesn't
+	 * execute.
+ */
+ for (i = currentNBuffers; i < targetNBuffers; i++)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+
+ ClearBufferTag(&buf->tag);
+
+ pg_atomic_init_u32(&buf->state, 0);
+ buf->wait_backend_pgprocno = INVALID_PROC_NUMBER;
+
+ buf->buf_id = i;
+
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ LWTRANCHE_BUFFER_CONTENT);
+
+ ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+ }
+
+ /*
+ * We do not touch StrategyControl here. Instead it is done by background
+ * writer when handling PROCSIGNAL_BARRIER_SHBUF_EXPAND or
+ * PROCSIGNAL_BARRIER_SHBUF_SHRINK barrier.
+ */
+}
+
+/*
+ * BufferManagerShmemValidate
+ * Validate that buffer manager shared memory structures have correct
+ * pointers and sizes after a resize operation.
+ *
+ * This function is called by backends during ProcessBarrierShmemResizeStruct
+ * to ensure their view of the buffer structures is consistent after memory
+ * remapping.
+ */
+void
+BufferManagerShmemValidate(int targetNBuffers)
+{
+ bool found;
+ void *tmpPtr;
+
+ /* Validate Buffer Descriptors */
+ tmpPtr = (BufferDescPadded *)
+ ShmemInitStructInSegment("Buffer Descriptors",
+ targetNBuffers * sizeof(BufferDescPadded),
+ &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT);
+ if (!found || BufferDescriptors != tmpPtr)
+ elog(FATAL, "validating buffer descriptors failed: expected pointer %p, got %p, found=%d",
+ BufferDescriptors, tmpPtr, found);
+
+ /* Validate Buffer IO Condition Variables */
+ tmpPtr = (ConditionVariableMinimallyPadded *)
+ ShmemInitStructInSegment("Buffer IO Condition Variables",
+ targetNBuffers * sizeof(ConditionVariableMinimallyPadded),
+ &found, BUFFER_IOCV_SHMEM_SEGMENT);
+ if (!found || BufferIOCVArray != tmpPtr)
+ elog(FATAL, "validating buffer IO condition variables failed: expected pointer %p, got %p, found=%d",
+ BufferIOCVArray, tmpPtr, found);
+
+ /* Validate Checkpoint BufferIds */
+ tmpPtr = (CkptSortItem *)
+ ShmemInitStructInSegment("Checkpoint BufferIds",
+ targetNBuffers * sizeof(CkptSortItem), &found,
+ CHECKPOINT_BUFFERS_SHMEM_SEGMENT);
+ if (!found || CkptBufferIds != tmpPtr)
+ elog(FATAL, "validating checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d",
+ CkptBufferIds, tmpPtr, found);
+
+ /* Validate Buffer Blocks */
+ tmpPtr = (char *)
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ ShmemInitStructInSegment("Buffer Blocks",
+ targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ &found, BUFFERS_SHMEM_SEGMENT));
+ if (!found || BufferBlocks != tmpPtr)
+ elog(FATAL, "validating buffer blocks failed: expected pointer %p, got %p, found=%d",
+ BufferBlocks, tmpPtr, found);
+}
+
+/*
+ * check_shared_buffers
+ * GUC check_hook for shared_buffers
+ *
+ * When reloading the configuration, shared_buffers must not be set to a
+ * value higher than max_shared_buffers, which is fixed at boot time.
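+ *
+ * For example (hypothetical values), with max_shared_buffers corresponding
+ * to 1GB fixed at startup, raising shared_buffers to 2GB in postgresql.conf
+ * and reloading is rejected by this hook.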
+ */
+bool
+check_shared_buffers(int *newval, void **extra, GucSource source)
+{
+ if (finalMaxNBuffers && *newval > MaxNBuffers)
+ {
+		GUC_check_errdetail("\"shared_buffers\" cannot be greater than \"max_shared_buffers\".");
+ return false;
+ }
+ return true;
+}
+
+/*
+ * show_shared_buffers
+ * GUC show_hook for shared_buffers
+ *
+ * Shows both current and pending buffer counts with proper unit formatting.
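+ *
+ * For example, while a resize is pending this might show
+ * "128MB (pending: 256MB)".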
+ */
+const char *
+show_shared_buffers(void)
+{
+ static char buffer[128];
+ int64 current_value, pending_value;
+ const char *current_unit, *pending_unit;
+ int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers);
+
+ if (currentNBuffers == NBuffersPending)
+ {
+ /* No buffer pool resizing pending. */
+		convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, &current_value, &current_unit);
+ snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s", current_value, current_unit);
+ }
+ else
+ {
+ /*
+ * New value for NBuffers is loaded but not applied yet, show both
+ * current and pending.
+ */
+		convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, &current_value, &current_unit);
+ convert_int_from_base_unit(NBuffersPending, GUC_UNIT_BLOCKS, &pending_value, &pending_unit);
+ snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s (pending: " INT64_FORMAT "%s)",
+ current_value, current_unit, pending_value, pending_unit);
+ }
+
+ return buffer;
+}
diff --git a/src/backend/storage/buffer/buf_resize.c b/src/backend/storage/buffer/buf_resize.c
new file mode 100644
index 000000000000..e815600c3ba0
--- /dev/null
+++ b/src/backend/storage/buffer/buf_resize.c
@@ -0,0 +1,399 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_resize.c
+ * shared buffer pool resizing functionality
+ *
+ * This module contains the implementation of shared buffer pool resizing,
+ * including the main resize coordination function and barrier processing
+ * functions that synchronize all backends during resize operations.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/buf_resize.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "utils/injection_point.h"
+
+
+/*
+ * Prepare ShmemCtrl for resizing the shared buffer pool.
+ */
+static void
+MarkBufferResizingStart(int targetNBuffers, int currentNBuffers)
+{
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == currentNBuffers);
+
+ pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, targetNBuffers);
+ ShmemCtrl->coordinator = MyProcPid;
+}
+
+/*
+ * Reset ShmemCtrl after resizing the shared buffer pool is done.
+ */
+static void
+MarkBufferResizingEnd(int NBuffers)
+{
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == NBuffers);
+ pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, 0);
+ ShmemCtrl->coordinator = -1;
+}
+
+/*
+ * Communicate the given buffer pool resize barrier to all other backends
+ * and the postmaster.
+ *
+ * ProcSignalBarrier is not sent to the Postmaster but we need the Postmaster to
+ * update its knowledge about the buffer pool so that it can be inherited by the
+ * child processes.
+ */
+static void
+SharedBufferResizeBarrier(ProcSignalBarrierType barrier, const char *barrier_name)
+{
+ WaitForProcSignalBarrier(EmitProcSignalBarrier(barrier));
+ elog(LOG, "all backends acknowledged %s barrier", barrier_name);
+
+#ifdef USE_INJECTION_POINTS
+ /* Injection point specific to this barrier type */
+ switch (barrier)
+ {
+ case PROCSIGNAL_BARRIER_SHBUF_SHRINK:
+ INJECTION_POINT("pgrsb-shrink-barrier-sent", NULL);
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM:
+ INJECTION_POINT("pgrsb-resize-barrier-sent", NULL);
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_EXPAND:
+ INJECTION_POINT("pgrsb-expand-barrier-sent", NULL);
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED:
+ /* TODO: Add an injection point here. */
+ break;
+ case PROCSIGNAL_BARRIER_SMGRRELEASE:
+ /*
+			 * Not relevant to this function, but listed so that the compiler
+			 * can detect any missing shared buffer resizing barrier enum
+			 * value.
+ */
+ break;
+ }
+#endif /* USE_INJECTION_POINTS */
+}
+
+/*
+ * C implementation of the SQL interface to update the shared buffers
+ * according to the current value of the shared_buffers GUC.
+ *
+ * The current boundaries of the buffer pool are given by two ranges.
+ *
+ * - [1, StrategyControl::activeNBuffers] is the range of buffers from which new
+ * allocations can happen at any time.
+ *
+ * - [1, ShmemCtrl::currentNBuffers] is the range of valid buffers at any given
+ * time.
+ *
+ * Let's assume that before resizing, the number of buffers in the buffer pool is
+ * NBuffersOld. After resizing it is NBuffersNew. Before resizing
+ * StrategyControl::activeNBuffers == ShmemCtrl::currentNBuffers == NBuffersOld.
+ * After the resizing finishes StrategyControl::activeNBuffers ==
+ * ShmemCtrl::currentNBuffers == NBuffersNew. Thus when no resizing happens
+ * these two ranges are the same.
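+ *
+ * For example (hypothetical values), when shrinking from NBuffersOld = 1000
+ * to NBuffersNew = 600, StrategyControl::activeNBuffers drops to 600 first,
+ * so no new allocations land in buffers 601..1000; those buffers are then
+ * evicted, and only afterwards does ShmemCtrl::currentNBuffers drop to 600.
+ * When expanding, the order is reversed: currentNBuffers grows first and
+ * activeNBuffers last.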
+ *
+ * The following steps are performed by the coordinator during resizing.
+ *
+ * 1. Marks resizing in progress to avoid multiple concurrent invocations of this
+ * function.
+ *
+ * 2. When shrinking the shared buffer pool, the coordinator sends SHBUF_SHRINK
+ * ProcSignalBarrier. In response to this barrier background writer is expected
+ * to set StrategyControl::activeNBuffers = NBuffersNew to restrict the new
+ * buffer allocations only to the new buffer pool size and also reset its
+ * internal state. Once every backend has acknowledged the barrier, the
+ * coordinator can be sure that new allocations will not happen in the buffer
+ * pool area being shrunk. Then it evicts the buffers in that area. Note that
+ * ShmemCtrl::currentNBuffers is still NBuffersOld, since backends may still
+ * access buffers allocated before the resizing started. Buffer eviction may
+ * fail if a buffer being evicted is pinned, in which case the resizing
+ * operation is aborted.
+ * Once the eviction is finished, the extra memory can be freed in the next step.
+ *
+ * 3. This step is executed in both cases, whether expanding or shrinking the
+ * buffer pool. The anonymous file backing each of the shared memory segments
+ * containing the buffer pool's shared data structures is resized to the
+ * amount of memory required for the new buffer pool size. When expanding,
+ * the newly added portion of memory is initialized appropriately.
+ * ShmemCtrl::currentNBuffers is set to NBuffersNew to indicate the new range
+ * of valid shared buffers. Every backend is sent the SHBUF_RESIZE_MAP_AND_MEM
+ * barrier.
+ * All the backends validate that their pointers to the shared buffers structure
+ * are valid and have the right size. Once every backend has acknowledged the
+ * barrier, this step finishes.
+ *
+ * 4. When expanding the buffer pool, the coordinator sends the SHBUF_EXPAND
+ * barrier to signal the end of expansion. In response, the background writer
+ * sets StrategyControl::activeNBuffers = NBuffersNew so that new allocations
+ * can use the expanded range of the buffer pool.
+ *
+ * TODO: Handle the case when the backend executing this function dies or the
+ * query is cancelled or it hits an error while resizing.
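+ *
+ * A minimal usage sketch, assuming the function is exposed to SQL under the
+ * same name:
+ *
+ *     ALTER SYSTEM SET shared_buffers = '2GB';
+ *     SELECT pg_reload_conf();
+ *     SELECT pg_resize_shared_buffers();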
+ */
+Datum
+pg_resize_shared_buffers(PG_FUNCTION_ARGS)
+{
+ bool result = true;
+ int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers);
+ int targetNBuffers = NBuffersPending;
+
+ if (currentNBuffers == targetNBuffers)
+ {
+ elog(LOG, "shared buffers are already at %d, no need to resize", currentNBuffers);
+ PG_RETURN_BOOL(true);
+ }
+
+ if (!pg_atomic_test_set_flag(&ShmemCtrl->resize_in_progress))
+ {
+ elog(LOG, "shared buffer resizing already in progress");
+ PG_RETURN_BOOL(false);
+ }
+
+ /*
+	 * TODO: What if the NBuffersPending value seen here is not the desired
+	 * one because somebody did another pg_reload_conf() between the reload
+	 * that set the desired value and the execution of this function?
+ */
+ MarkBufferResizingStart(targetNBuffers, currentNBuffers);
+ elog(LOG, "resizing shared buffers from %d to %d", currentNBuffers, targetNBuffers);
+
+ INJECTION_POINT("pg-resize-shared-buffers-flag-set", NULL);
+
+ /* Phase 1: SHBUF_SHRINK - Only for shrinking buffer pool */
+ if (targetNBuffers < currentNBuffers)
+ {
+ /*
+		 * Phase 1: Shrinking - send the SHBUF_SHRINK barrier. In response,
+		 * the background writer sets activeNBuffers = targetNBuffers to
+		 * restrict buffer pool allocations to the new size.
+ */
+ elog(LOG, "Phase 1: Shrinking buffer pool, restricting allocations to %d buffers", targetNBuffers);
+
+ SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_SHRINK, CppAsString(PROCSIGNAL_BARRIER_SHBUF_SHRINK));
+
+ /* Evict buffers in the area being shrunk */
+ elog(LOG, "evicting buffers %u..%u", targetNBuffers + 1, currentNBuffers);
+ if (!EvictExtraBuffers(targetNBuffers, currentNBuffers))
+ {
+ elog(WARNING, "failed to evict extra buffers during shrinking");
+ SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED));
+ MarkBufferResizingEnd(currentNBuffers);
+ pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress);
+ PG_RETURN_BOOL(false);
+ }
+
+ /* Update the current NBuffers. */
+ pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers);
+ }
+
+ /* Phase 2: SHBUF_RESIZE_MAP_AND_MEM - Both expanding and shrinking */
+ elog(LOG, "Phase 2: Remapping shared memory segments and updating structures");
+ if (!AnonymousShmemResize())
+ {
+ /*
+		 * This should never fail since the address space should already be
+		 * reserved, so the failure is treated as PANIC.
+ */
+ elog(PANIC, "failed to resize anonymous shared memory");
+ }
+
+ /* Update structure pointers and sizes */
+ BufferManagerShmemResize(currentNBuffers, targetNBuffers);
+
+ INJECTION_POINT("pgrsb-after-shmem-resize", NULL);
+
+ SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM));
+
+ /* Phase 3: SHBUF_EXPAND - Only for expanding buffer pool */
+ if (targetNBuffers > currentNBuffers)
+ {
+ /*
+		 * Phase 3: Expanding - send the SHBUF_EXPAND barrier. In response,
+		 * the background writer sets activeNBuffers = targetNBuffers and
+		 * backends start allocating buffers from the expanded range.
+ */
+ elog(LOG, "Phase 3: Expanding buffer pool, enabling allocations up to %d buffers", targetNBuffers);
+ pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers);
+
+ SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_EXPAND, CppAsString(PROCSIGNAL_BARRIER_SHBUF_EXPAND));
+ }
+
+ /*
+ * Reset buffer resize control area.
+ */
+ MarkBufferResizingEnd(targetNBuffers);
+
+ pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress);
+
+ elog(LOG, "successfully resized shared buffers to %d", targetNBuffers);
+
+ PG_RETURN_BOOL(result);
+}
+
+bool
+ProcessBarrierShmemShrink(void)
+{
+ int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers);
+
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ /*
+ * Delay adjusting the new active size of buffer pool till this process
+ * becomes ready to resize buffers.
+ */
+ if (delay_shmem_resize)
+ {
+ elog(LOG, "Phase 1: Delaying SHBUF_SHRINK barrier - restricting allocations to %d buffers, coordinator is %d",
+ targetNBuffers, ShmemCtrl->coordinator);
+
+ return false;
+ }
+
+ if (MyBackendType == B_BG_WRITER)
+ {
+ /*
+ * We have to reset the background writer's buffer allocation statistics
+ * and the strategy control together so that background writer doesn't go
+ * out of sync with ClockSweepTick().
+ *
+		 * TODO: If the background writer is not running, nobody would reset
+		 * the strategy control area, so we can't rely on the background
+		 * writer to do that. Find a better way.
+ */
+ BgBufferSyncReset(NBuffers, targetNBuffers);
+ /* Reset strategy control to new size */
+ StrategyReset(targetNBuffers);
+ }
+
+ elog(LOG, "Phase 1: Processing SHBUF_SHRINK barrier - NBuffers = %d, coordinator is %d",
+ NBuffers, ShmemCtrl->coordinator);
+
+ return true;
+}
+
+bool
+ProcessBarrierShmemResizeMapAndMem(void)
+{
+ int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers);
+
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ /*
+	 * If the buffer pool is being shrunk, we are already working with a
+	 * smaller buffer pool, so shrinking the address space and shared
+	 * structures should not be a problem. When expanding, growing the
+	 * address space and shared structures beyond the current boundaries is
+	 * not a problem either, since we are not accessing that memory yet. So
+	 * there is no reason to delay processing this barrier.
+ */
+
+ /*
+ * Coordinator has already adjusted its address map and also updated sizes
+ * of the shared buffer structures, no further validation needed.
+ */
+ if (ShmemCtrl->coordinator == MyProcPid)
+ return true;
+
+ /*
+ * Backends validate that their pointers to shared buffer structures are
+ * still valid and have the correct size after memory remapping.
+ *
+	 * TODO: Do we want to do this only in assert-enabled builds?
+ */
+ BufferManagerShmemValidate(targetNBuffers);
+
+ elog(LOG, "Backend %d successfully validated structure pointers after resize", MyProcPid);
+
+ return true;
+}
+
+bool
+ProcessBarrierShmemExpand(void)
+{
+ int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers);
+
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ /*
+ * Delay adjusting the new active size of buffer pool till this process
+ * becomes ready to resize buffers.
+ */
+ if (delay_shmem_resize)
+ {
+ elog(LOG, "Phase 3: delaying SHBUF_EXPAND barrier - enabling allocations up to %d buffers, coordinator is %d",
+ targetNBuffers, ShmemCtrl->coordinator);
+ return false;
+ }
+
+ if (MyBackendType == B_BG_WRITER)
+ {
+ /*
+ * We have to reset the background writer's buffer allocation statistics
+ * and the strategy control together so that background writer doesn't go
+ * out of sync with ClockSweepTick().
+ *
+		 * TODO: If the background writer is not running, nobody would reset
+		 * the strategy control area, so we can't rely on the background
+		 * writer to do that. Find a better way.
+ */
+ BgBufferSyncReset(NBuffers, targetNBuffers);
+ StrategyReset(targetNBuffers);
+ }
+
+ elog(LOG, "Phase 3: Processing SHBUF_EXPAND barrier - targetNBuffers = %d, ShmemCtrl->coordinator = %d", targetNBuffers, ShmemCtrl->coordinator);
+
+ return true;
+}
+
+bool
+ProcessBarrierShmemResizeFailed(void)
+{
+ int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers);
+ int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers);
+
+ Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress));
+
+ if (MyBackendType == B_BG_WRITER)
+ {
+ /*
+ * We have to reset the background writer's buffer allocation statistics
+ * and the strategy control together so that background writer doesn't go
+ * out of sync with ClockSweepTick().
+ *
+		 * TODO: If the background writer is not running, nobody would reset
+		 * the strategy control area, so we can't rely on the background
+		 * writer to do that. Find a better way.
+ */
+ BgBufferSyncReset(NBuffers, currentNBuffers);
+ /* Reset strategy control to new size */
+ StrategyReset(currentNBuffers);
+ }
+
+ elog(LOG, "received proc signal indicating failure to resize shared buffers from %d to %d, restoring to %d, coordinator is %d",
+ NBuffers, targetNBuffers, currentNBuffers, ShmemCtrl->coordinator);
+
+ return true;
+}
\ No newline at end of file
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 9d256559bab9..18c9c6f336c1 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,7 +21,13 @@
*/
#include "postgres.h"
+#include "fmgr.h"
+#include "funcapi.h"
#include "storage/buf_internals.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "utils/rel.h"
+#include "utils/builtins.h"
/* entry for buffer lookup hashtable */
typedef struct
@@ -59,10 +65,18 @@ InitBufTable(int size)
info.entrysize = sizeof(BufferLookupEnt);
info.num_partitions = NUM_BUFFER_PARTITIONS;
- SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
+ /*
+	 * The shared buffer lookup table is set up only once, with the maximum
+	 * possible number of entries given the maximum size of the buffer pool.
+	 * It is not resized after that, even if the buffer pool is resized.
+	 * Hence it is allocated in the main shared memory segment and not in a
+	 * resizeable one.
+ */
+ SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table",
size, size,
&info,
- HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE);
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE,
+ MAIN_SHMEM_SEGMENT);
}
/*
@@ -159,3 +173,56 @@ BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
if (!result) /* shouldn't happen */
elog(ERROR, "shared buffer hash table corrupted");
}
+
+/*
+ * BufTableGetContents
+ * Fill the given tuplestore with contents of the shared buffer lookup table
+ *
+ * This function is used by pg_buffercache extension to expose buffer lookup
+ * table contents via SQL. The caller is responsible for setting up the
+ * tuplestore and result set info.
+ */
+void
+BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc)
+{
+/* Expected number of attributes of the buffer lookup table entry. */
+#define BUFTABLE_CONTENTS_COLS 6
+
+ HASH_SEQ_STATUS hstat;
+ BufferLookupEnt *ent;
+ Datum values[BUFTABLE_CONTENTS_COLS];
+ bool nulls[BUFTABLE_CONTENTS_COLS];
+ int i;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ Assert(tupdesc->natts == BUFTABLE_CONTENTS_COLS);
+
+ /*
+ * Lock all buffer mapping partitions to ensure a consistent view of the
+ * hash table during the scan. Must grab LWLocks in partition-number order
+ * to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_BUFFER_PARTITIONS; i++)
+ LWLockAcquire(BufMappingPartitionLockByIndex(i), LW_SHARED);
+
+ hash_seq_init(&hstat, SharedBufHash);
+ while ((ent = (BufferLookupEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ values[0] = ObjectIdGetDatum(ent->key.spcOid);
+ values[1] = ObjectIdGetDatum(ent->key.dbOid);
+ values[2] = ObjectIdGetDatum(ent->key.relNumber);
+		values[3] = Int16GetDatum(ent->key.forkNum);	/* forknum column is int2 */
+ values[4] = Int64GetDatum(ent->key.blockNum);
+ values[5] = Int32GetDatum(ent->id);
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /*
+ * Release all buffer mapping partition locks in the reverse order so as
+ * to avoid LWLock deadlock.
+ */
+ for (i = NUM_BUFFER_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(BufMappingPartitionLockByIndex(i));
+}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 327ddb7adc88..f489ae2932fc 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -57,6 +57,7 @@
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
+#include "storage/pg_shmem.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
@@ -66,6 +67,7 @@
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
+#include "utils/injection_point.h"
/* Note: these two macros only work on shared buffers, not local ones! */
@@ -3415,6 +3417,9 @@ BufferSync(int flags)
ProcessProcSignalBarrier();
}
+ /* Injection point after scanning all buffers for dirty pages */
+ INJECTION_POINT("buffer-sync-dirty-buffer-scan", NULL);
+
if (num_to_scan == 0)
return; /* nothing to do */
@@ -3607,6 +3612,32 @@ BufferSync(int flags)
TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
}
+/*
+ * Information saved between BgBufferSync() calls so we can determine the
+ * strategy point's advance rate and avoid scanning already-cleaned buffers. The
+ * variables are global instead of static local so that BgBufferSyncReset() can
+ * adjust it when resizing shared buffers.
+ */
+static bool saved_info_valid = false;
+static int prev_strategy_buf_id;
+static uint32 prev_strategy_passes;
+static int next_to_clean;
+static uint32 next_passes;
+
+/* Moving averages of allocation rate and clean-buffer density */
+static float smoothed_alloc = 0;
+static float smoothed_density = 10.0;
+
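+/*
+ * BgBufferSyncReset -- invalidate BgBufferSync()'s saved scan state.
+ *
+ * Called around buffer pool resizing so that the next BgBufferSync() call
+ * re-establishes its position instead of relying on stale values.
+ */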
+void
+BgBufferSyncReset(int currentNBuffers, int targetNBuffers)
+{
+ saved_info_valid = false;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d",
+ currentNBuffers, targetNBuffers);
+#endif
+}
+
/*
* BgBufferSync -- Write out some dirty buffers in the pool.
*
@@ -3626,20 +3657,6 @@ BgBufferSync(WritebackContext *wb_context)
uint32 strategy_passes;
uint32 recent_alloc;
- /*
- * Information saved between calls so we can determine the strategy
- * point's advance rate and avoid scanning already-cleaned buffers.
- */
- static bool saved_info_valid = false;
- static int prev_strategy_buf_id;
- static uint32 prev_strategy_passes;
- static int next_to_clean;
- static uint32 next_passes;
-
- /* Moving averages of allocation rate and clean-buffer density */
- static float smoothed_alloc = 0;
- static float smoothed_density = 10.0;
-
/* Potentially these could be tunables, but for now, not */
float smoothing_samples = 16;
float scan_whole_pool_milliseconds = 120000.0;
@@ -3662,6 +3679,25 @@ BgBufferSync(WritebackContext *wb_context)
long new_strategy_delta;
uint32 new_recent_alloc;
+ /*
+	 * If the buffer pool is being shrunk, the buffer being written out may
+	 * not remain valid. If the buffer pool is being expanded, more buffers
+	 * will become available without this function writing out any. Hence
+	 * wait till buffer resizing finishes, i.e. go into hibernation mode.
+ *
+	 * TODO: We may not need this synchronization if the background writer
+	 * itself becomes the coordinator.
+ */
+ if (!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress))
+ return true;
+
+ /*
+ * Resizing shared buffers while this function is performing an LRU scan on
+ * them may lead to wrong results. Indicate that the resizing should wait for
+ * the LRU scan to complete.
+ */
+ delay_shmem_resize = true;
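+
+	/*
+	 * While delay_shmem_resize is set, ProcessBarrierShmemShrink() and
+	 * ProcessBarrierShmemExpand() return false, so the resize coordinator's
+	 * WaitForProcSignalBarrier() keeps retrying until the scan below clears
+	 * the flag again.
+	 */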
+
/*
* Find out where the clock-sweep currently is, and how many buffer
* allocations have happened since our last call.
@@ -3679,6 +3715,7 @@ BgBufferSync(WritebackContext *wb_context)
if (bgwriter_lru_maxpages <= 0)
{
saved_info_valid = false;
+ delay_shmem_resize = false;
return true;
}
@@ -3838,8 +3875,17 @@ BgBufferSync(WritebackContext *wb_context)
num_written = 0;
reusable_buffers = reusable_buffers_est;
- /* Execute the LRU scan */
- while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+ /*
+ * Execute the LRU scan.
+ *
+	 * If the buffer pool is being shrunk, the buffer being written may not
+	 * remain valid. If the buffer pool is being expanded, more buffers will
+	 * become available without this function writing any. Hence stop the
+	 * scan; this also unblocks other processes waiting for buffer resizing
+	 * to finish.
+ * finish.
+ */
+ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est &&
+ !pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress))
{
int sync_state = SyncOneBuffer(next_to_clean, true,
wb_context);
@@ -3898,6 +3944,9 @@ BgBufferSync(WritebackContext *wb_context)
#endif
}
+ /* Let the resizing commence. */
+ delay_shmem_resize = false;
+
/* Return true if OK to hibernate */
return (bufs_to_lap == 0 && recent_alloc == 0);
}
@@ -4208,7 +4257,23 @@ DebugPrintBufferRefcount(Buffer buffer)
void
CheckPointBuffers(int flags)
{
+	/*
+	 * Mark that buffer sync is in progress, delaying any shared memory
+	 * resizing.
+	 *
+	 * TODO: We need to assess whether we should allow checkpointing and
+	 * buffer resizing to run in parallel. When expanding buffers it may be
+	 * fine to let the checkpointer run in the RESIZE_MAP_AND_MEM phase but
+	 * delay the EXPAND phase till the checkpoint finishes, while not
+	 * allowing a checkpoint to start during the expansion phase. When
+	 * shrinking the buffers, we should delay the SHRINK phase till the
+	 * checkpoint finishes and not start a checkpoint till the SHRINK phase
+	 * is done, but allow it to run in the RESIZE_MAP_AND_MEM phase. This
+	 * needs careful analysis and testing.
+ */
+ delay_shmem_resize = true;
+
BufferSync(flags);
+
+ /* Mark that buffer sync is no longer in progress - allow shared memory resizing */
+ delay_shmem_resize = false;
}
/*
@@ -7466,3 +7531,70 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
.complete_local = local_buffer_readv_complete,
.report = buffer_readv_report,
};
+
+/*
+ * When shrinking the shared buffer pool, evict the buffers that will not be
+ * part of the shrunk buffer pool.
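+ *
+ * Buffers in the range (targetNBuffers, currentNBuffers] are evicted. Note
+ * that Buffer numbers are 1-based while descriptor indexes are 0-based,
+ * hence GetBufferDescriptor(buf - 1) in the loop below.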
+ */
+bool
+EvictExtraBuffers(int targetNBuffers, int currentNBuffers)
+{
+ bool result = true;
+
+ Assert(targetNBuffers < currentNBuffers);
+
+ /*
+	 * If the buffer being evicted is locked, this function will need to
+	 * wait. It should not be called from the postmaster, which cannot wait
+	 * on a lock.
+ */
+ Assert(IsUnderPostmaster);
+
+ /*
+ * TODO: Before evicting any buffer, we should check whether any of the
+ * buffers are pinned. If we find that a buffer is pinned after evicting
+ * most of them, that will impact performance since all those evicted
+ * buffers might need to be read again.
+ */
+ for (Buffer buf = targetNBuffers + 1; buf <= currentNBuffers; buf++)
+ {
+ BufferDesc *desc = GetBufferDescriptor(buf - 1);
+ uint32 buf_state;
+ bool buffer_flushed;
+
+ buf_state = pg_atomic_read_u32(&desc->state);
+
+ /*
+ * Nobody is expected to touch the buffers while resizing is
+ * going one hence unlocked precheck should be safe and saves
+ * some cycles.
+ */
+ if (!(buf_state & BM_VALID))
+ continue;
+
+ /*
+ * XXX: Looks like CurrentResourceOwner can be NULL here, find
+ * another one in that case?
+		 */
+ if (CurrentResourceOwner)
+ ResourceOwnerEnlarge(CurrentResourceOwner);
+
+ ReservePrivateRefCountEntry();
+
+ LockBufHdr(desc);
+
+ /*
+ * Now that we have locked buffer descriptor, make sure that the
+ * buffer without valid data has been skipped above.
+ */
+ Assert(buf_state & BM_VALID);
+
+ if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed))
+ {
+ elog(WARNING, "could not remove buffer %u, it is pinned", buf);
+ result = false;
+ break;
+ }
+ }
+
+ return result;
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 28d952b35344..256521d889af 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -19,6 +19,7 @@
#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+#include "storage/pg_shmem.h"
#include "storage/proc.h"
#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
@@ -32,10 +33,16 @@ typedef struct
/* Spinlock: protects the values below */
slock_t buffer_strategy_lock;
+ /*
+ * Number of active buffers that can be allocated. During buffer resizing,
+ * this may be different from NBuffers which tracks the global buffer count.
+ */
+ pg_atomic_uint32 activeNBuffers;
+
/*
* clock-sweep hand: index of next buffer to consider grabbing. Note that
* this isn't a concrete buffer - we only ever increase the value. So, to
- * get an actual buffer, it needs to be used modulo NBuffers.
+ * get an actual buffer, it needs to be used modulo activeNBuffers.
*/
pg_atomic_uint32 nextVictimBuffer;
@@ -100,21 +107,27 @@ static inline uint32
ClockSweepTick(void)
{
uint32 victim;
+ int activeBuffers;
/*
- * Atomically move hand ahead one buffer - if there's several processes
- * doing this, this can lead to buffers being returned slightly out of
- * apparent order.
+	 * Atomically move hand ahead one buffer - if several processes are doing
+	 * this, this can lead to buffers being returned slightly out of apparent
+	 * order. We need to read the current position of the hand and the
+	 * current buffer allocation limit together consistently, since they may
+	 * be reset by a concurrent resize.
*/
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
victim =
pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+ activeBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers);
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
- if (victim >= NBuffers)
+ if (victim >= activeBuffers)
{
uint32 originalVictim = victim;
/* always wrap what we look up in BufferDescriptors */
- victim = victim % NBuffers;
+ victim = victim % activeBuffers;
/*
* If we're the one that just caused a wraparound, force
@@ -142,7 +155,7 @@ ClockSweepTick(void)
*/
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
- wrapped = expected % NBuffers;
+ wrapped = expected % activeBuffers;
success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
&expected, wrapped);
@@ -227,7 +240,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
/* Use the "clock sweep" algorithm to find a free buffer */
- trycounter = NBuffers;
+ trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers);
+
for (;;)
{
uint32 old_buf_state;
@@ -280,7 +294,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
local_buf_state))
{
- trycounter = NBuffers;
+ trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers);
break;
}
}
@@ -322,10 +336,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
uint32 nextVictimBuffer;
int result;
+ uint32 activeNBuffers;
SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
- result = nextVictimBuffer % NBuffers;
+ activeNBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers);
+ result = nextVictimBuffer % activeNBuffers;
if (complete_passes)
{
@@ -335,7 +351,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
* Additionally add the number of wraparounds that happened before
* completePasses could be incremented. C.f. ClockSweepTick().
*/
- *complete_passes += nextVictimBuffer / NBuffers;
+ *complete_passes += nextVictimBuffer / activeNBuffers;
}
if (num_buf_alloc)
@@ -382,7 +398,7 @@ StrategyShmemSize(void)
Size size = 0;
/* size of lookup hash table ... see comment in StrategyInitialize */
- size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+ size = add_size(size, BufTableShmemSize(MaxNBuffers + NUM_BUFFER_PARTITIONS));
/* size of the shared replacement strategy control block */
size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -390,6 +406,31 @@ StrategyShmemSize(void)
return size;
}
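+
+/*
+ * StrategyReset -- reset the strategy control area after a buffer pool resize.
+ *
+ * Called by the background writer while processing the buffer resize
+ * barriers; it installs the new allocation limit and restarts the clock
+ * sweep from buffer 0.
+ */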
+void
+StrategyReset(int activeNBuffers)
+{
+ Assert(StrategyControl);
+
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ /* Update the active buffer count for the strategy */
+ pg_atomic_write_u32(&StrategyControl->activeNBuffers, activeNBuffers);
+
+ /* Reset the clock-sweep pointer to start from beginning */
+ pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+ /*
+	 * The statistics are viewed in the context of the number of shared
+	 * buffers. Reset them when the number of active shared buffers changes.
+ */
+ StrategyControl->completePasses = 0;
+ pg_atomic_write_u32(&StrategyControl->numBufferAllocs, 0);
+
+	/* TODO: Do we need to reset background writer notifications? */
+ StrategyControl->bgwprocno = -1;
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
/*
* StrategyInitialize -- initialize the buffer cache replacement
* strategy.
@@ -407,20 +448,29 @@ StrategyInitialize(bool init)
*
* Since we can't tolerate running out of lookup table entries, we must be
* sure to specify an adequate table size here. The maximum steady-state
- * usage is of course NBuffers entries, but BufferAlloc() tries to insert
- * a new entry before deleting the old. In principle this could be
- * happening in each partition concurrently, so we could need as many as
- * NBuffers + NUM_BUFFER_PARTITIONS entries.
+ * usage is of course as many entries as there are buffers in the buffer
+ * pool. Right now there is no way to free shared memory. Even if we shrank
+ * the buffer lookup table when shrinking the buffer pool, the unused hash
+ * table entries could not be freed. When we expand the buffer pool, more
+ * entries can be allocated, but we cannot resize the hash table directory
+ * without rehashing all the entries, and just allocating more entries would
+ * lead to more contention. Hence we set up the buffer lookup table
+ * considering the maximum possible size of the buffer pool, which is
+ * MaxNBuffers.
+ *
+ * Additionally BufferAlloc() tries to insert a new entry before deleting the
+ * old. In principle this could be happening in each partition concurrently,
+ * so we need an extra NUM_BUFFER_PARTITIONS entries.
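+ *
+ * For example (hypothetical sizing), with 8kB pages and a maximum buffer
+ * pool size of 8GB, MaxNBuffers is 1048576 and the table is sized for
+ * 1048576 + NUM_BUFFER_PARTITIONS entries.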
*/
- InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+ InitBufTable(MaxNBuffers + NUM_BUFFER_PARTITIONS);
/*
* Get or create the shared strategy control block
*/
StrategyControl = (BufferStrategyControl *)
- ShmemInitStruct("Buffer Strategy Status",
+ ShmemInitStructInSegment("Buffer Strategy Status",
sizeof(BufferStrategyControl),
- &found);
+ &found, MAIN_SHMEM_SEGMENT);
if (!found)
{
@@ -431,6 +481,8 @@ StrategyInitialize(bool init)
SpinLockInit(&StrategyControl->buffer_strategy_lock);
+ /* Initialize the active buffer count */
+ pg_atomic_init_u32(&StrategyControl->activeNBuffers, NBuffersPending);
/* Initialize the clock-sweep pointer */
pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
@@ -668,12 +720,23 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
strategy->current = 0;
/*
- * If the slot hasn't been filled yet, tell the caller to allocate a new
- * buffer with the normal allocation strategy. He will then fill this
- * slot by calling AddBufferToRing with the new buffer.
+ * If the slot hasn't been filled yet or the buffer in the slot has been
+	 * invalidated when the buffer pool was shrunk, tell the caller to allocate a new
+ * buffer with the normal allocation strategy. He will then fill this slot
+ * by calling AddBufferToRing with the new buffer.
+ *
+	 * TODO: Ideally we would check for bufnum > NBuffers only once after
+	 * each time the buffer pool is shrunk, so as to catch any runtime bugs
+	 * that introduce invalid buffers into the ring. But that is complicated:
+	 * the BufferAccessStrategy objects are not accessible outside the
+	 * ScanState, hence we cannot purge the rings while evicting buffers, and
+	 * after the resizing is finished there is no way to notice when we touch
+	 * the first and the last of those objects. See if this can be fixed.
*/
bufnum = strategy->buffers[strategy->current];
- if (bufnum == InvalidBuffer)
+ if (bufnum == InvalidBuffer ||
+ bufnum > pg_atomic_read_u32(&StrategyControl->activeNBuffers))
return NULL;
buf = GetBufferDescriptor(bufnum - 1);
diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build
index 448976d2400b..2fc58db5a917 100644
--- a/src/backend/storage/buffer/meson.build
+++ b/src/backend/storage/buffer/meson.build
@@ -6,4 +6,5 @@ backend_sources += files(
'bufmgr.c',
'freelist.c',
'localbuf.c',
+ 'buf_resize.c',
)
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
index 2704e80b3a7d..1965b2d3eb4d 100644
--- a/src/backend/storage/ipc/ipc.c
+++ b/src/backend/storage/ipc/ipc.c
@@ -61,6 +61,8 @@ static void proc_exit_prepare(int code);
* but provide some additional features we need --- in particular,
* we want to register callbacks to invoke when we are disconnecting
* from a broken shared-memory context but not exiting the postmaster.
+ * The maximum number of such exit callbacks depends on the number of
+ * shared memory segments.
*
* Callback functions can take zero, one, or two args: the first passed
* arg is the integer exitcode, the second is the Datum supplied when
@@ -68,7 +70,7 @@ static void proc_exit_prepare(int code);
* ----------------------------------------------------------------
*/
-#define MAX_ON_EXITS 20
+#define MAX_ON_EXITS 40
struct ONEXIT
{
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index b23d0c19360a..23e9b53ea074 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -81,10 +81,17 @@ RequestAddinShmemSpace(Size size)
/*
* CalculateShmemSize
- * Calculates the amount of shared memory needed.
+ * Calculates the amount of shared memory needed.
+ *
+ * The amount of shared memory required per segment is saved in mapping_sizes,
+ * which is expected to be an array of size NUM_MEMORY_MAPPINGS. The total
+ * amount of memory needed across all the segments is returned. For the memory
+ * mappings which reserve address space for future expansion, the required
+ * amount of reserved space is saved in the corresponding mapping_sizes
+ * entries; that reserved space is not included in the returned value.
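+ *
+ * For example, the buffer-related segments record the size needed for the
+ * pending shared_buffers setting in shmem_req_size and the address space to
+ * reserve for MaxNBuffers in shmem_reserved.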
*/
Size
-CalculateShmemSize(void)
+CalculateShmemSize(MemoryMappingSizes *mapping_sizes)
{
Size size;
@@ -102,7 +109,13 @@ CalculateShmemSize(void)
sizeof(ShmemIndexEnt)));
size = add_size(size, dsm_estimate_size());
size = add_size(size, DSMRegistryShmemSize());
- size = add_size(size, BufferManagerShmemSize());
+
+ /*
+ * Buffer manager adds estimates for memory requirements for every shared
+ * memory segment that it uses in the corresponding AnonymousMappings.
+ * Consider size required from only the main shared memory segment here.
+ */
+ size = add_size(size, BufferManagerShmemSize(mapping_sizes));
size = add_size(size, LockManagerShmemSize());
size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
@@ -141,11 +154,32 @@ CalculateShmemSize(void)
size = add_size(size, AioShmemSize());
size = add_size(size, WaitLSNShmemSize());
+ /*
+ * XXX: For some reason slightly more memory is needed for larger
+ * shared_buffers, but this size is enough for any large value I've tested
+	 * with. Is it a mistake in how slots are split, or was there a hidden
+	 * inconsistency in the shmem calculation?
+ */
+ size = add_size(size, 1024 * 1024 * 100);
+
/* include additional requested shmem from preload libraries */
size = add_size(size, total_addin_request);
+ /*
+ * All the shared memory allocations considered so far happen in the main
+ * shared memory segment.
+ */
+ mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_req_size = size;
+ mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_reserved = size;
+
+ size = 0;
/* might as well round it off to a multiple of a typical page size */
- size = add_size(size, 8192 - (size % 8192));
+ for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+ {
+ round_off_mapping_sizes(&mapping_sizes[segment]);
+ /* Compute the total size of all segments */
+ size = size + mapping_sizes[segment].shmem_req_size;
+ }
return size;
}
@@ -191,32 +225,44 @@ CreateSharedMemoryAndSemaphores(void)
{
PGShmemHeader *shim;
PGShmemHeader *seghdr;
- Size size;
+ MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
Assert(!IsUnderPostmaster);
- /* Compute the size of the shared-memory block */
- size = CalculateShmemSize();
- elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
-
- /*
- * Create the shmem segment
- */
- seghdr = PGSharedMemoryCreate(size, &shim);
-
- /*
- * Make sure that huge pages are never reported as "unknown" while the
- * server is running.
- */
- Assert(strcmp("unknown",
- GetConfigOption("huge_pages_status", false, false)) != 0);
+ CalculateShmemSize(mapping_sizes);
- InitShmemAccess(seghdr);
+ /* Decide if we use huge pages or regular size pages */
+ PrepareHugePages();
- /*
- * Set up shared memory allocation mechanism
- */
- InitShmemAllocation();
+	for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+ {
+ MemoryMappingSizes *mapping = &mapping_sizes[segment];
+
+ /* Compute the size of the shared-memory block */
+ elog(DEBUG3, "invoking IpcMemoryCreate(segment %s, size=%zu, reserved address space=%zu)",
+ MappingName(segment), mapping->shmem_req_size, mapping->shmem_reserved);
+
+ /*
+ * Create the shmem segment.
+ *
+		 * XXX: Are multiple shims needed, one per segment?
+ */
+ seghdr = PGSharedMemoryCreate(mapping, segment, &shim);
+
+ /*
+ * Make sure that huge pages are never reported as "unknown" while the
+ * server is running.
+ */
+ Assert(strcmp("unknown",
+ GetConfigOption("huge_pages_status", false, false)) != 0);
+
+ InitShmemAccessInSegment(seghdr, segment);
+
+ /*
+ * Set up shared memory allocation mechanism
+ */
+ InitShmemAllocationInSegment(segment);
+ }
/* Initialize subsystems */
CreateOrAttachShmemStructs();
@@ -274,6 +320,8 @@ CreateOrAttachShmemStructs(void)
CommitTsShmemInit();
SUBTRANSShmemInit();
MultiXactShmemInit();
+ /* TODO: This should be part of BufferManagerShmemInit() */
+ ShmemControlInit();
BufferManagerShmemInit();
/*
@@ -334,7 +382,9 @@ CreateOrAttachShmemStructs(void)
* InitializeShmemGUCs
*
* This function initializes runtime-computed GUCs related to the amount of
- * shared memory required for the current configuration.
+ * shared memory required for the current configuration. It assumes that the
+ * memory required by the shared memory segments is already calculated and is
+ * available in AnonymousMappings.
*/
void
InitializeShmemGUCs(void)
@@ -343,11 +393,13 @@ InitializeShmemGUCs(void)
Size size_b;
Size size_mb;
Size hp_size;
+ MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
+
/*
* Calculate the shared memory size and round up to the nearest megabyte.
*/
- size_b = CalculateShmemSize();
+ size_b = CalculateShmemSize(mapping_sizes);
size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024);
sprintf(buf, "%zu", size_mb);
SetConfigOption("shared_memory_size", buf,
@@ -356,7 +408,7 @@ InitializeShmemGUCs(void)
/*
* Calculate the number of huge pages required.
*/
- GetHugePageSize(&hp_size, NULL);
+ GetHugePageSize(&hp_size, NULL, NULL);
if (hp_size != 0)
{
Size hp_required;
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 087821311cce..c7c36f2be675 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -24,9 +24,11 @@
#include "port/pg_bitutils.h"
#include "replication/logicalworker.h"
#include "replication/walsender.h"
+#include "storage/bufmgr.h"
#include "storage/condition_variable.h"
#include "storage/ipc.h"
#include "storage/latch.h"
+#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
@@ -109,6 +111,10 @@ static bool CheckProcSignal(ProcSignalReason reason);
static void CleanupProcSignalState(int status, Datum arg);
static void ResetProcSignalBarrierBits(uint32 flags);
+#ifdef DEBUG_SHMEM_RESIZE
+bool delay_proc_signal_init = false;
+#endif
+
/*
* ProcSignalShmemSize
* Compute space needed for ProcSignal's shared memory
@@ -170,6 +176,43 @@ ProcSignalInit(const uint8 *cancel_key, int cancel_key_len)
uint32 old_pss_pid;
Assert(cancel_key_len >= 0 && cancel_key_len <= MAX_CANCEL_KEY_LENGTH);
+
+#ifdef DEBUG_SHMEM_RESIZE
+ /*
+ * Introduced for debugging purposes. You can change the variable at
+ * runtime using gdb, then start new backends with delayed ProcSignal
+	 * initialization. A simple pg_usleep won't work here because the SIGHUP
+	 * interrupt is needed for testing. The wait loop is taken from pg_sleep().
+ */
+ if (delay_proc_signal_init)
+ {
+#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0)
+ float8 endtime = GetNowFloat() + 5;
+
+ for (;;)
+ {
+ float8 delay;
+ long delay_ms;
+
+ CHECK_FOR_INTERRUPTS();
+
+ delay = endtime - GetNowFloat();
+ if (delay >= 600.0)
+ delay_ms = 600000;
+ else if (delay > 0.0)
+ delay_ms = (long) (delay * 1000.0);
+ else
+ break;
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ delay_ms,
+ WAIT_EVENT_PG_SLEEP);
+ ResetLatch(MyLatch);
+ }
+ }
+#endif
+
if (MyProcNumber < 0)
elog(ERROR, "MyProcNumber not set");
if (MyProcNumber >= NumProcSignalSlots)
@@ -576,6 +619,18 @@ ProcessProcSignalBarrier(void)
case PROCSIGNAL_BARRIER_SMGRRELEASE:
processed = ProcessBarrierSmgrRelease();
break;
+ case PROCSIGNAL_BARRIER_SHBUF_SHRINK:
+ processed = ProcessBarrierShmemShrink();
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM:
+ processed = ProcessBarrierShmemResizeMapAndMem();
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_EXPAND:
+ processed = ProcessBarrierShmemExpand();
+ break;
+ case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED:
+ processed = ProcessBarrierShmemResizeFailed();
+ break;
}
/*
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 0f18beb6ad4a..eafcb665ba91 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -69,27 +69,34 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "port/pg_numa.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
+#include "storage/ipc.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/builtins.h"
+#include "utils/injection_point.h"
+#include "utils/wait_event.h"
static void *ShmemAllocRaw(Size size, Size *allocated_size);
-static void *ShmemAllocUnlocked(Size size);
+static void *ShmemAllocRawInSegment(Size size, Size *allocated_size,
+ int shmem_segment);
/* shared memory global variables */
-static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+ShmemSegment Segments[NUM_MEMORY_MAPPINGS];
-static void *ShmemBase; /* start address of shared memory */
-
-static void *ShmemEnd; /* end+1 address of shared memory */
-
-slock_t *ShmemLock; /* spinlock for shared memory and LWLock
- * allocation */
-
-static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+/*
+ * Primary index hashtable for shmem. For simplicity we use a single one for
+ * all shared memory segments. There can be performance consequences of
+ * that, and an alternative would be to have one index per shared memory
+ * segment.
+ */
+static HTAB *ShmemIndex = NULL;
/* To get reliable results for NUMA inquiry we need to "touch pages" once */
static bool firstNumaTouch = true;
@@ -102,9 +109,17 @@ Datum pg_numa_available(PG_FUNCTION_ARGS);
void
InitShmemAccess(PGShmemHeader *seghdr)
{
- ShmemSegHdr = seghdr;
- ShmemBase = seghdr;
- ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
+ InitShmemAccessInSegment(seghdr, MAIN_SHMEM_SEGMENT);
+}
+
+void
+InitShmemAccessInSegment(PGShmemHeader *seghdr, int shmem_segment)
+{
+	PGShmemHeader *shmhdr = seghdr;
+	ShmemSegment *seg = &Segments[shmem_segment];
+
+	seg->ShmemSegHdr = shmhdr;
+ seg->ShmemBase = (void *) shmhdr;
+ seg->ShmemEnd = (char *) seg->ShmemBase + shmhdr->totalsize;
}
/*
@@ -115,7 +130,13 @@ InitShmemAccess(PGShmemHeader *seghdr)
void
InitShmemAllocation(void)
{
- PGShmemHeader *shmhdr = ShmemSegHdr;
+ InitShmemAllocationInSegment(MAIN_SHMEM_SEGMENT);
+}
+
+void
+InitShmemAllocationInSegment(int shmem_segment)
+{
+ PGShmemHeader *shmhdr = Segments[shmem_segment].ShmemSegHdr;
char *aligned;
Assert(shmhdr != NULL);
@@ -124,9 +145,9 @@ InitShmemAllocation(void)
* Initialize the spinlock used by ShmemAlloc. We must use
* ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
*/
- ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
+ Segments[shmem_segment].ShmemLock = (slock_t *) ShmemAllocUnlockedInSegment(sizeof(slock_t), shmem_segment);
- SpinLockInit(ShmemLock);
+ SpinLockInit(Segments[shmem_segment].ShmemLock);
/*
* Allocations after this point should go through ShmemAlloc, which
@@ -151,16 +172,22 @@ InitShmemAllocation(void)
*/
void *
ShmemAlloc(Size size)
+{
+ return ShmemAllocInSegment(size, MAIN_SHMEM_SEGMENT);
+}
+
+void *
+ShmemAllocInSegment(Size size, int shmem_segment)
{
void *newSpace;
Size allocated_size;
- newSpace = ShmemAllocRaw(size, &allocated_size);
+ newSpace = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment);
if (!newSpace)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of shared memory (%zu bytes requested)",
- size)));
+ errmsg("out of shared memory in segment %s (%zu bytes requested)",
+ MappingName(shmem_segment), size)));
return newSpace;
}
@@ -185,6 +212,12 @@ ShmemAllocNoError(Size size)
*/
static void *
ShmemAllocRaw(Size size, Size *allocated_size)
+{
+ return ShmemAllocRawInSegment(size, allocated_size, MAIN_SHMEM_SEGMENT);
+}
+
+static void *
+ShmemAllocRawInSegment(Size size, Size *allocated_size, int shmem_segment)
{
Size newStart;
Size newFree;
@@ -204,22 +237,22 @@ ShmemAllocRaw(Size size, Size *allocated_size)
size = CACHELINEALIGN(size);
*allocated_size = size;
- Assert(ShmemSegHdr != NULL);
+ Assert(Segments[shmem_segment].ShmemSegHdr != NULL);
- SpinLockAcquire(ShmemLock);
+ SpinLockAcquire(Segments[shmem_segment].ShmemLock);
- newStart = ShmemSegHdr->freeoffset;
+ newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset;
newFree = newStart + size;
- if (newFree <= ShmemSegHdr->totalsize)
+ if (newFree <= Segments[shmem_segment].ShmemSegHdr->totalsize)
{
- newSpace = (char *) ShmemBase + newStart;
- ShmemSegHdr->freeoffset = newFree;
+ newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart;
+ Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree;
}
else
newSpace = NULL;
- SpinLockRelease(ShmemLock);
+ SpinLockRelease(Segments[shmem_segment].ShmemLock);
/* note this assert is okay with newSpace == NULL */
Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
@@ -228,15 +261,16 @@ ShmemAllocRaw(Size size, Size *allocated_size)
}
/*
- * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
+ * ShmemAllocUnlockedInSegment
+ * allocate max-aligned chunk from given shared memory segment
*
* Allocate space without locking ShmemLock. This should be used for,
* and only for, allocations that must happen before ShmemLock is ready.
*
* We consider maxalign, rather than cachealign, sufficient here.
*/
-static void *
-ShmemAllocUnlocked(Size size)
+void *
+ShmemAllocUnlockedInSegment(Size size, int shmem_segment)
{
Size newStart;
Size newFree;
@@ -247,19 +281,19 @@ ShmemAllocUnlocked(Size size)
*/
size = MAXALIGN(size);
- Assert(ShmemSegHdr != NULL);
+ Assert(Segments[shmem_segment].ShmemSegHdr != NULL);
- newStart = ShmemSegHdr->freeoffset;
+ newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset;
newFree = newStart + size;
- if (newFree > ShmemSegHdr->totalsize)
+ if (newFree > Segments[shmem_segment].ShmemSegHdr->totalsize)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of shared memory (%zu bytes requested)",
- size)));
- ShmemSegHdr->freeoffset = newFree;
+ errmsg("out of shared memory in segment %s (%zu bytes requested)",
+ MappingName(shmem_segment), size)));
+ Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree;
- newSpace = (char *) ShmemBase + newStart;
+ newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart;
Assert(newSpace == (void *) MAXALIGN(newSpace));
@@ -274,7 +308,13 @@ ShmemAllocUnlocked(Size size)
bool
ShmemAddrIsValid(const void *addr)
{
- return (addr >= ShmemBase) && (addr < ShmemEnd);
+ return ShmemAddrIsValidInSegment(addr, MAIN_SHMEM_SEGMENT);
+}
+
+bool
+ShmemAddrIsValidInSegment(const void *addr, int shmem_segment)
+{
+ return (addr >= Segments[shmem_segment].ShmemBase) && (addr < Segments[shmem_segment].ShmemEnd);
}
/*
@@ -335,6 +375,18 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
int64 max_size, /* max size of the table */
HASHCTL *infoP, /* info about key and bucket size */
int hash_flags) /* info about infoP */
+{
+ return ShmemInitHashInSegment(name, init_size, max_size, infoP, hash_flags,
+ MAIN_SHMEM_SEGMENT);
+}
+
+HTAB *
+ShmemInitHashInSegment(const char *name, /* table string name for shmem index */
+ long init_size, /* initial table size */
+ long max_size, /* max size of the table */
+ HASHCTL *infoP, /* info about key and bucket size */
+ int hash_flags, /* info about infoP */
+ int shmem_segment) /* in which segment to keep the table */
{
bool found;
void *location;
@@ -351,9 +403,9 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
/* look it up in the shmem index */
- location = ShmemInitStruct(name,
+ location = ShmemInitStructInSegment(name,
hash_get_shared_size(infoP, hash_flags),
- &found);
+ &found, shmem_segment);
/*
* if it already exists, attach to it rather than allocate and initialize
@@ -386,6 +438,13 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
*/
void *
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
+{
+ return ShmemInitStructInSegment(name, size, foundPtr, MAIN_SHMEM_SEGMENT);
+}
+
+void *
+ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
+ int shmem_segment)
{
ShmemIndexEnt *result;
void *structPtr;
@@ -394,7 +453,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
if (!ShmemIndex)
{
- PGShmemHeader *shmemseghdr = ShmemSegHdr;
+ PGShmemHeader *shmemseghdr = Segments[shmem_segment].ShmemSegHdr;
/* Must be trying to create/attach to ShmemIndex itself */
Assert(strcmp(name, "ShmemIndex") == 0);
@@ -417,7 +476,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
* process can be accessing shared memory yet.
*/
Assert(shmemseghdr->index == NULL);
- structPtr = ShmemAlloc(size);
+ structPtr = ShmemAllocInSegment(size, shmem_segment);
shmemseghdr->index = structPtr;
*foundPtr = false;
}
@@ -434,16 +493,15 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
LWLockRelease(ShmemIndexLock);
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("could not create ShmemIndex entry for data structure \"%s\"",
- name)));
+ errmsg("could not create ShmemIndex entry for data structure \"%s\" in segment %d",
+ name, shmem_segment)));
}
if (*foundPtr)
{
/*
* Structure is in the shmem index so someone else has allocated it
 * already. The size better be the same as the size we are trying to
 * initialize to, or there is a name conflict (or worse).
*/
if (result->size != size)
{
@@ -453,6 +511,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
" \"%s\": expected %zu, actual %zu",
name, size, result->size)));
}
+
structPtr = result->location;
}
else
@@ -460,7 +519,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
Size allocated_size;
/* It isn't in the table yet. allocate and initialize it */
- structPtr = ShmemAllocRaw(size, &allocated_size);
+ structPtr = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment);
if (structPtr == NULL)
{
/* out of memory; remove the failed ShmemIndex entry */
@@ -475,18 +534,71 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
result->size = size;
result->allocated_size = allocated_size;
result->location = structPtr;
+ result->shmem_segment = shmem_segment;
}
LWLockRelease(ShmemIndexLock);
- Assert(ShmemAddrIsValid(structPtr));
+ Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment));
+
+ Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
+ return structPtr;
+}
+
+/*
+ * ShmemUpdateStructInSegment -- Update the size of a structure in shared memory.
+ *
+ * This function updates the size of an existing shared memory structure. It
+ * finds the structure in the shmem index and updates its size information while
+ * preserving the existing memory location.
+ *
+ * Returns: pointer to the existing structure location.
+ */
+void *
+ShmemUpdateStructInSegment(const char *name, Size size, bool *foundPtr,
+ int shmem_segment)
+{
+ ShmemIndexEnt *result;
+ void *structPtr;
+ Size delta;
+
+ LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
+
+ Assert(ShmemIndex);
+
+ /* Look up the structure in the shmem index */
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_FIND, foundPtr);
+
+ Assert(*foundPtr);
+ Assert(result);
+ Assert(result->shmem_segment == shmem_segment);
+
+ delta = size - result->size;
+ /* Store the existing structure pointer */
+ structPtr = result->location;
+
+	/*
+	 * Update the size information.
+	 *
+	 * TODO: Ideally we should implement repalloc-style functionality for
+	 * shared memory that returns the allocated size.
+	 */
+ result->size = size;
+ result->allocated_size = size;
+ /* Reflect size change in the shared segment */
+ SpinLockAcquire(Segments[shmem_segment].ShmemLock);
+ Segments[shmem_segment].ShmemSegHdr->freeoffset += delta;
+ SpinLockRelease(Segments[shmem_segment].ShmemLock);
+ LWLockRelease(ShmemIndexLock);
+
+ /* Verify the structure is still in the correct segment */
+ Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment));
Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
return structPtr;
}
+
/*
* Add two Size values, checking for overflow
*/
@@ -527,13 +639,14 @@ mul_size(Size s1, Size s2)
Datum
pg_get_shmem_allocations(PG_FUNCTION_ARGS)
{
-#define PG_GET_SHMEM_SIZES_COLS 4
+#define PG_GET_SHMEM_SIZES_COLS 5
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
HASH_SEQ_STATUS hstat;
ShmemIndexEnt *ent;
- Size named_allocated = 0;
+ Size named_allocated[NUM_MEMORY_MAPPINGS] = {0};
Datum values[PG_GET_SHMEM_SIZES_COLS];
bool nulls[PG_GET_SHMEM_SIZES_COLS];
+ int i;
InitMaterializedSRF(fcinfo, 0);
@@ -546,29 +659,40 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
{
values[0] = CStringGetTextDatum(ent->key);
- values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
- values[2] = Int64GetDatum(ent->size);
- values[3] = Int64GetDatum(ent->allocated_size);
- named_allocated += ent->allocated_size;
+ values[1] = CStringGetTextDatum(MappingName(ent->shmem_segment));
+ values[2] = Int64GetDatum((char *) ent->location - (char *) Segments[ent->shmem_segment].ShmemSegHdr);
+ values[3] = Int64GetDatum(ent->size);
+ values[4] = Int64GetDatum(ent->allocated_size);
+ named_allocated[ent->shmem_segment] += ent->allocated_size;
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
values, nulls);
}
/* output shared memory allocated but not counted via the shmem index */
- values[0] = CStringGetTextDatum("");
- nulls[1] = true;
- values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
- values[3] = values[2];
- tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ for (i = 0; i < NUM_MEMORY_MAPPINGS; i++)
+ {
+ values[0] = CStringGetTextDatum("");
+ values[1] = CStringGetTextDatum(MappingName(i));
+ nulls[2] = true;
+ values[3] = Int64GetDatum(Segments[i].ShmemSegHdr->freeoffset - named_allocated[i]);
+ values[4] = values[3];
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ }
/* output as-of-yet unused shared memory */
- nulls[0] = true;
- values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
- nulls[1] = false;
- values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
- values[3] = values[2];
- tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ memset(nulls, 0, sizeof(nulls));
+
+ for (i = 0; i < NUM_MEMORY_MAPPINGS; i++)
+ {
+ PGShmemHeader *shmhdr = Segments[i].ShmemSegHdr;
+ nulls[0] = true;
+ values[1] = CStringGetTextDatum(MappingName(i));
+ values[2] = Int64GetDatum(shmhdr->freeoffset);
+ values[3] = Int64GetDatum(shmhdr->totalsize - shmhdr->freeoffset);
+ values[4] = values[3];
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ }
LWLockRelease(ShmemIndexLock);
@@ -593,7 +717,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
Size os_page_size;
void **page_ptrs;
int *pages_status;
- uint64 shm_total_page_count,
+ uint64 shm_total_page_count = 0,
shm_ent_page_count,
max_nodes;
Size *nodes;
@@ -628,7 +752,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
* this is not very likely, and moreover we have more entries, each of
* them using only fraction of the total pages.
*/
- shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
+	for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+ {
+ PGShmemHeader *shmhdr = Segments[segment].ShmemSegHdr;
+ shm_total_page_count += (shmhdr->totalsize / os_page_size) + 1;
+ }
+
page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
pages_status = palloc(sizeof(int) * shm_total_page_count);
@@ -751,7 +880,7 @@ pg_get_shmem_pagesize(void)
Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
if (huge_pages_status == HUGE_PAGES_ON)
- GetHugePageSize(&os_page_size, NULL);
+ GetHugePageSize(&os_page_size, NULL, NULL);
return os_page_size;
}
@@ -761,3 +890,45 @@ pg_numa_available(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(pg_numa_init() != -1);
}
+
+/* SQL SRF showing shared memory segments */
+Datum
+pg_get_shmem_segments(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_SEGS_COLS 6
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ Datum values[PG_GET_SHMEM_SEGS_COLS];
+ bool nulls[PG_GET_SHMEM_SEGS_COLS];
+ int i;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ /* output all allocated entries */
+ for (i = 0; i < NUM_MEMORY_MAPPINGS; i++)
+ {
+ ShmemSegment *segment = &Segments[i];
+ PGShmemHeader *shmhdr = segment->ShmemSegHdr;
+ int j;
+
+ if (shmhdr == NULL)
+ {
+ for (j = 0; j < PG_GET_SHMEM_SEGS_COLS; j++)
+ nulls[j] = true;
+ }
+ else
+ {
+ memset(nulls, 0, sizeof(nulls));
+ values[0] = Int32GetDatum(i);
+ values[1] = CStringGetTextDatum(MappingName(i));
+ values[2] = Int64GetDatum(shmhdr->totalsize);
+ values[3] = Int64GetDatum(shmhdr->freeoffset);
+ values[4] = Int64GetDatum(segment->shmem_size);
+ values[5] = Int64GetDatum(segment->shmem_reserved);
+ }
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ return (Datum) 0;
+}
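For illustration, the new SRF can be queried directly, or through a pg_shmem_segments system view if the series defines one (the regression test below queries such a view; its definition is not part of this excerpt). A sketch, using the output column names from the pg_proc.dat entry further down:

    SELECT id, name, size, freeoffset, mapping_size, mapping_reserved_size
    FROM pg_get_shmem_segments()
    ORDER BY id;

Per the NULL handling above, segments whose header is not yet initialized come back as all-NULL rows.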
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index b017880f5e45..c25dd13b63af 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -80,6 +80,8 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
+#include "postmaster/postmaster.h"
+#include "storage/pg_shmem.h"
#include "storage/proc.h"
#include "storage/proclist.h"
#include "storage/procnumber.h"
@@ -612,12 +614,15 @@ LWLockNewTrancheId(const char *name)
/*
* We use the ShmemLock spinlock to protect LWLockCounter and
* LWLockTrancheNames.
+ *
+	 * XXX: This looks like the only use of Segments outside of shmem.c; it
+	 * may be worth reshaping this part to hide the Segments structure.
*/
- SpinLockAcquire(ShmemLock);
+ SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock);
if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES)
{
- SpinLockRelease(ShmemLock);
+ SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock);
ereport(ERROR,
(errmsg("maximum number of tranches already registered"),
errdetail("No more than %d tranches may be registered.",
@@ -628,7 +633,7 @@ LWLockNewTrancheId(const char *name)
LocalLWLockCounter = *LWLockCounter;
strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN);
- SpinLockRelease(ShmemLock);
+ SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock);
return result;
}
@@ -750,9 +755,9 @@ GetLWTrancheName(uint16 trancheId)
*/
if (trancheId >= LocalLWLockCounter)
{
- SpinLockAcquire(ShmemLock);
+ SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock);
LocalLWLockCounter = *LWLockCounter;
- SpinLockRelease(ShmemLock);
+ SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock);
if (trancheId >= LocalLWLockCounter)
elog(ERROR, "tranche %d is not registered", trancheId);
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 7dd75a490aab..9c9ebe4280a0 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -63,6 +63,7 @@
#include "rewrite/rewriteHandler.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procsignal.h"
@@ -4128,6 +4129,9 @@ PostgresSingleUserMain(int argc, char *argv[],
/* Initialize size of fast-path lock cache. */
InitializeFastPathLocks();
+ /* Initialize MaxNBuffers for buffer pool resizing. */
+ InitializeMaxNBuffers();
+
/*
* Give preloaded libraries a chance to request additional shared memory.
*/
@@ -4318,6 +4322,13 @@ PostgresMain(const char *dbname, const char *username)
*/
BeginReportingGUCOptions();
+ /*
+	 * TODO: The new backend should fetch the shared buffers status. If a
+	 * resize is in progress, it should bring itself up to speed with it; if
+	 * not, it should simply fetch the latest pointers and sizes. Is this the
+	 * right place to do that?
+ */
+
/*
* Also set up handler to log session end; we have to wait till now to be
* sure Log_disconnections has its final value.
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index c1ac71ff7f24..ee5887496baf 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -162,6 +162,7 @@ WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit."
WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication."
WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated."
XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end."
+PM_BUFFER_RESIZE_WAIT "Waiting for the postmaster to complete shared buffer pool resize operations."
ABI_compatibility:
@@ -358,6 +359,7 @@ InjectionPoint "Waiting to read or update information related to injection point
SerialControl "Waiting to read or update shared pg_serial state."
AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue."
WaitLSN "Waiting to read or update shared Wait-for-LSN state."
+ShmemResize "Waiting to resize shared memory."
#
# END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE)
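As a hedged illustration of how these would surface: names from wait_event_names.txt are exposed in CamelCase in pg_stat_activity, so PM_BUFFER_RESIZE_WAIT should appear as an IPC wait event named PmBufferResizeWait, and the new predefined lock as an LWLock wait event named ShmemResize (both derived names are assumptions based on the usual conversion):

    SELECT pid, backend_type, wait_event_type, wait_event
    FROM pg_stat_activity
    WHERE wait_event IN ('PmBufferResizeWait', 'ShmemResize');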
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index d31cb45a0588..419c7fad8901 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -139,7 +139,10 @@ int max_parallel_maintenance_workers = 2;
* MaxBackends is computed by PostmasterMain after modules have had a chance to
* register background workers.
*/
-int NBuffers = 16384;
+int NBuffers = 0;
+int NBuffersPending = 16384;
+bool finalMaxNBuffers = false;
+int MaxNBuffers = 0;
int MaxConnections = 100;
int max_worker_processes = 8;
int max_parallel_workers = 8;
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 98f9598cd789..46a8a8a3faad 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -595,6 +595,55 @@ InitializeFastPathLocks(void)
pg_nextpower2_32(FastPathLockGroupsPerBackend));
}
+/*
+ * Initialize MaxNBuffers variable with validation.
+ *
+ * This must be called after GUCs have been loaded but before shared memory size
+ * is determined.
+ *
+ * Since MaxNBuffers limits the size of the buffer pool, it must be at least as
+ * much as NBuffersPending. If MaxNBuffers is 0 (default), set it to
+ * NBuffersPending. Otherwise, validate that MaxNBuffers is not less than
+ * NBuffersPending.
+ */
+void
+InitializeMaxNBuffers(void)
+{
+ if (MaxNBuffers == 0) /* default/boot value */
+ {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", NBuffersPending);
+ SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER,
+ PGC_S_DYNAMIC_DEFAULT);
+
+ /*
+ * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
+ * However, if the DBA explicitly set max_shared_buffers = 0 in
+ * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override
+ * that and we must force the matter with PGC_S_OVERRIDE.
+ */
+ if (MaxNBuffers == 0) /* failed to apply it? */
+ SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER,
+ PGC_S_OVERRIDE);
+ }
+ else
+ {
+ if (MaxNBuffers < NBuffersPending)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("max_shared_buffers (%d) cannot be less than current shared_buffers (%d)",
+ MaxNBuffers, NBuffersPending),
+ errhint("Increase max_shared_buffers or decrease shared_buffers.")));
+ }
+ }
+
+ Assert(MaxNBuffers > 0);
+ Assert(!finalMaxNBuffers);
+ finalMaxNBuffers = true;
+}
+
/*
* Early initialization of a backend (either standalone or under postmaster).
* This happens even before InitPostgres.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c6484aea087c..96233ba5cb27 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2599,7 +2599,7 @@ convert_to_base_unit(double value, const char *unit,
* the value without loss. For example, if the base unit is GUC_UNIT_KB, 1024
* is converted to 1 MB, but 1025 is represented as 1025 kB.
*/
-static void
+void
convert_int_from_base_unit(int64 base_value, int base_unit,
int64 *value, const char **unit)
{
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 1128167c0251..539b29f0065a 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -2013,6 +2013,15 @@
max => 'MAX_BACKENDS /* XXX? */',
},
+{ name => 'max_shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM',
+ short_desc => 'Sets the upper limit for the shared_buffers value.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'MaxNBuffers',
+ boot_val => '0',
+ min => '0',
+ max => 'INT_MAX / 2',
+},
+
{ name => 'max_slot_wal_keep_size', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING',
short_desc => 'Sets the maximum WAL size that can be reserved by replication slots.',
long_desc => 'Replication slots will be marked as failed, and segments released for deletion or recycling, if this much space is occupied by WAL on disk. -1 means no maximum.',
@@ -2581,13 +2590,15 @@
# We sometimes multiply the number of shared buffers by two without
# checking for overflow, so we mustn't allow more than INT_MAX / 2.
-{ name => 'shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM',
+{ name => 'shared_buffers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM',
short_desc => 'Sets the number of shared memory buffers used by the server.',
flags => 'GUC_UNIT_BLOCKS',
- variable => 'NBuffers',
+ variable => 'NBuffersPending',
boot_val => '16384',
min => '16',
max => 'INT_MAX / 2',
+ check_hook => 'check_shared_buffers',
+ show_hook => 'show_shared_buffers',
},
{ name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
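With shared_buffers now reloadable (PGC_SIGHUP) and guarded by check_shared_buffers, values above max_shared_buffers are rejected up front. The expected regression output below exercises exactly this; with max_shared_buffers = 300MB in effect:

    ALTER SYSTEM SET shared_buffers = '400MB';
    ERROR:  invalid value for parameter "shared_buffers": 51200
    DETAIL:  "shared_buffers" must be less than "max_shared_buffers".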
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 1edb18958f75..d0c9e6ec7577 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8576,8 +8576,8 @@
{ oid => '5052', descr => 'allocations from the main shared memory segment',
proname => 'pg_get_shmem_allocations', prorows => '50', proretset => 't',
provolatile => 'v', prorettype => 'record', proargtypes => '',
- proallargtypes => '{text,int8,int8,int8}', proargmodes => '{o,o,o,o}',
- proargnames => '{name,off,size,allocated_size}',
+ proallargtypes => '{text,text,int8,int8,int8}', proargmodes => '{o,o,o,o,o}',
+ proargnames => '{name,segment,off,size,allocated_size}',
prosrc => 'pg_get_shmem_allocations' },
{ oid => '4099', descr => 'Is NUMA support available?',
@@ -8600,6 +8600,14 @@
proargmodes => '{o,o,o}', proargnames => '{name,type,size}',
prosrc => 'pg_get_dsm_registry_allocations' },
+# shared memory segments
+{ oid => '5101', descr => 'shared memory segments',
+  proname => 'pg_get_shmem_segments', prorows => '5', proretset => 't',
+ provolatile => 'v', prorettype => 'record', proargtypes => '',
+ proallargtypes => '{int4,text,int8,int8,int8,int8}', proargmodes => '{o,o,o,o,o,o}',
+ proargnames => '{id,name,size,freeoffset,mapping_size,mapping_reserved_size}',
+ prosrc => 'pg_get_shmem_segments' },
+
# memory context of local backend
{ oid => '2282',
descr => 'information about all memory contexts of local backend',
@@ -12612,4 +12620,10 @@
proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}',
prosrc => 'pg_get_aios' },
+{ oid => '9999', descr => 'resize shared buffers to the current value of the shared_buffers GUC',
+ proname => 'pg_resize_shared_buffers',
+ provolatile => 'v',
+ prorettype => 'bool',
+ proargtypes => '',
+ prosrc => 'pg_resize_shared_buffers'},
]
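Putting the pieces together, the intended workflow, as exercised by the regression test below, is to change the GUC, reload, and then apply the pending value explicitly:

    ALTER SYSTEM SET shared_buffers = '64MB';
    SELECT pg_reload_conf();
    SHOW shared_buffers;                -- e.g. "128MB (pending: 64MB)"
    SELECT pg_resize_shared_buffers();  -- returns t on success
    SHOW shared_buffers;                -- 64MB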
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 9a7d733ddeff..b4dc2c4ba57d 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -173,7 +173,11 @@ extern PGDLLIMPORT bool ExitOnAnyError;
extern PGDLLIMPORT char *DataDir;
extern PGDLLIMPORT int data_directory_mode;
+/* TODO: This is no longer a GUC variable; it should be moved somewhere else. */
extern PGDLLIMPORT int NBuffers;
+extern PGDLLIMPORT int NBuffersPending;
+extern PGDLLIMPORT bool finalMaxNBuffers;
+extern PGDLLIMPORT int MaxNBuffers;
extern PGDLLIMPORT int MaxBackends;
extern PGDLLIMPORT int MaxConnections;
extern PGDLLIMPORT int max_worker_processes;
@@ -502,6 +506,7 @@ extern PGDLLIMPORT ProcessingMode Mode;
extern void pg_split_opts(char **argv, int *argcp, const char *optstr);
extern void InitializeMaxBackends(void);
extern void InitializeFastPathLocks(void);
+extern void InitializeMaxNBuffers(void);
extern void InitPostgres(const char *in_dbname, Oid dboid,
const char *username, Oid useroid,
bits32 flags,
diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h
index ef9800732d90..40588ff69683 100644
--- a/src/include/portability/mem.h
+++ b/src/include/portability/mem.h
@@ -38,7 +38,7 @@
#define MAP_NOSYNC 0
#endif
-#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE)
+#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE)
/* Some really old systems don't define MAP_FAILED. */
#ifndef MAP_FAILED
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 5400c56a965f..4c53194e13e4 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -28,6 +28,7 @@
#include "storage/spin.h"
#include "utils/relcache.h"
#include "utils/resowner.h"
+#include "utils/tuplestore.h"
/*
* Buffer state is a single 32-bit variable where following data is combined.
@@ -512,6 +513,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno);
extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
+extern void StrategyReset(int activeNBuffers);
/* buf_table.c */
extern Size BufTableShmemSize(int size);
@@ -520,6 +522,7 @@ extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern void BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc);
/* localbuf.c */
extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index b5f8f3c5d42f..774cf8f38edd 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -19,7 +19,9 @@
#include "storage/block.h"
#include "storage/buf.h"
#include "storage/bufpage.h"
+#include "storage/pg_shmem.h"
#include "storage/relfilelocator.h"
+#include "utils/guc.h"
#include "utils/relcache.h"
#include "utils/snapmgr.h"
@@ -158,6 +160,7 @@ typedef struct WritebackContext WritebackContext;
/* in globals.c ... this duplicates miscadmin.h */
extern PGDLLIMPORT int NBuffers;
+extern PGDLLIMPORT int NBuffersPending;
/* in bufmgr.c */
extern PGDLLIMPORT bool zero_damaged_pages;
@@ -204,6 +207,11 @@ extern PGDLLIMPORT int32 *LocalRefCount;
#define BUFFER_LOCK_SHARE 1
#define BUFFER_LOCK_EXCLUSIVE 2
+/*
+ * prototypes for functions in buf_init.c
+ */
+extern const char *show_shared_buffers(void);
+extern bool check_shared_buffers(int *newval, void **extra, GucSource source);
/*
* prototypes for functions in bufmgr.c
@@ -307,6 +315,7 @@ extern bool IsBufferCleanupOK(Buffer buffer);
extern bool HoldingBufferPinThatDelaysRecovery(void);
extern bool BgBufferSync(WritebackContext *wb_context);
+extern void BgBufferSyncReset(int currentNBuffers, int targetNBuffers);
extern uint32 GetPinLimit(void);
extern uint32 GetLocalPinLimit(void);
@@ -323,10 +332,13 @@ extern void EvictRelUnpinnedBuffers(Relation rel,
int32 *buffers_evicted,
int32 *buffers_flushed,
int32 *buffers_skipped);
+extern bool EvictExtraBuffers(int targetNBuffers, int currentNBuffers);
/* in buf_init.c */
extern void BufferManagerShmemInit(void);
-extern Size BufferManagerShmemSize(void);
+extern Size BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes);
+extern void BufferManagerShmemResize(int currentNBuffers, int targetNBuffers);
+extern void BufferManagerShmemValidate(int targetNBuffers);
/* in localbuf.c */
extern void AtProcExit_LocalBuffers(void);
@@ -375,7 +387,7 @@ extern void FreeAccessStrategy(BufferAccessStrategy strategy);
static inline bool
BufferIsValid(Buffer bufnum)
{
- Assert(bufnum <= NBuffers);
+ Assert(bufnum <= (Buffer) pg_atomic_read_u32(&ShmemCtrl->currentNBuffers));
Assert(bufnum >= -NLocBuffer);
return bufnum != InvalidBuffer;
@@ -429,4 +441,11 @@ BufferGetPage(Buffer buffer)
#endif /* FRONTEND */
+/* buf_resize.c */
+extern Datum pg_resize_shared_buffers(PG_FUNCTION_ARGS);
+extern bool ProcessBarrierShmemShrink(void);
+extern bool ProcessBarrierShmemResizeMapAndMem(void);
+extern bool ProcessBarrierShmemExpand(void);
+extern bool ProcessBarrierShmemResizeFailed(void);
+
#endif /* BUFMGR_H */
diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h
index 2a8a8f0eabdb..6dbbb9ad064a 100644
--- a/src/include/storage/ipc.h
+++ b/src/include/storage/ipc.h
@@ -18,6 +18,8 @@
#ifndef IPC_H
#define IPC_H
+#include "storage/pg_shmem.h"
+
typedef void (*pg_on_exit_callback) (int code, Datum arg);
typedef void (*shmem_startup_hook_type) (void);
@@ -64,6 +66,7 @@ typedef void (*shmem_startup_hook_type) (void);
/* ipc.c */
extern PGDLLIMPORT bool proc_exit_inprogress;
extern PGDLLIMPORT bool shmem_exit_inprogress;
+extern PGDLLIMPORT volatile bool delay_shmem_resize;
pg_noreturn extern void proc_exit(int code);
extern void shmem_exit(int code);
@@ -77,11 +80,13 @@ extern void check_on_shmem_exit_lists_are_empty(void);
/* ipci.c */
extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook;
-extern Size CalculateShmemSize(void);
+extern Size CalculateShmemSize(MemoryMappingSizes *mapping_sizes);
extern void CreateSharedMemoryAndSemaphores(void);
#ifdef EXEC_BACKEND
extern void AttachSharedMemoryStructs(void);
#endif
extern void InitializeShmemGUCs(void);
+extern void CoordinateShmemResize(void);
+extern bool AnonymousShmemResize(void);
#endif /* IPC_H */
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index 5b0ce383408c..9c4b928441ce 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -86,6 +86,7 @@ PG_LWLOCK(51, InjectionPoint)
PG_LWLOCK(52, SerialControl)
PG_LWLOCK(53, AioWorkerSubmissionQueue)
PG_LWLOCK(54, WaitLSN)
+PG_LWLOCK(55, ShmemResize)
/*
* There also exist several built-in LWLock tranches. As with the predefined
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 5f7d4b83a60e..369000688209 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -24,7 +24,19 @@
#ifndef PG_SHMEM_H
#define PG_SHMEM_H
+#include "port/atomics.h"
+#include "storage/barrier.h"
#include "storage/dsm_impl.h"
+#include "storage/procsignal.h"
+#include "storage/spin.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+
+typedef struct MemoryMappingSizes
+{
+	Size		shmem_req_size; /* required size of the segment */
+	Size		shmem_reserved; /* required size of the reserved address space */
+} MemoryMappingSizes;
typedef struct PGShmemHeader /* standard header for all Postgres shmem */
{
@@ -41,11 +53,56 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
#endif
} PGShmemHeader;
+typedef struct ShmemSegment
+{
+ PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+ void *ShmemBase; /* start address of shared memory */
+ void *ShmemEnd; /* end+1 address of shared memory */
+ slock_t *ShmemLock; /* spinlock for shared memory and LWLock
+ * allocation */
+ int segment_fd; /* fd for the backing anon file */
+ unsigned long seg_id; /* IPC key */
+ int shmem_segment; /* TODO: Do we really need it? */
+ Size shmem_size; /* Size of the actually used memory */
+ Size shmem_reserved; /* Size of the reserved mapping */
+ Pointer shmem; /* Pointer to the start of the mapped memory */
+ Pointer seg_addr; /* SysV shared memory for the header */
+} ShmemSegment;
+
+/* Number of available segments for anonymous memory mappings */
+#define NUM_MEMORY_MAPPINGS 5
+
+extern PGDLLIMPORT ShmemSegment Segments[NUM_MEMORY_MAPPINGS];
+
+/*
+ * ShmemControl is shared between backends and helps to coordinate shared
+ * memory resize.
+ *
+ * TODO: I think we need a lock to protect this structure. If we do so, do we
+ * need to use atomic integers?
+ */
+typedef struct
+{
+	pg_atomic_flag resize_in_progress; /* true if a resize is in progress */
+	pg_atomic_uint32 currentNBuffers;	/* NBuffers value before the resize started */
+	pg_atomic_uint32 targetNBuffers;	/* NBuffers value being resized to */
+	pid_t		coordinator;	/* PID of the backend coordinating the resize */
+} ShmemControl;
+
+extern PGDLLIMPORT ShmemControl *ShmemCtrl;
+
+/* The phases of shared memory resizing, used for the ProcSignal barrier. */
+#define SHMEM_RESIZE_REQUESTED 0
+#define SHMEM_RESIZE_START 1
+#define SHMEM_RESIZE_DONE 2
+
/* GUC variables */
extern PGDLLIMPORT int shared_memory_type;
extern PGDLLIMPORT int huge_pages;
extern PGDLLIMPORT int huge_page_size;
extern PGDLLIMPORT int huge_pages_status;
+extern PGDLLIMPORT bool finalMaxNBuffers;
+extern PGDLLIMPORT int MaxNBuffers;
/* Possible values for huge_pages and huge_pages_status */
typedef enum
@@ -85,10 +142,53 @@ extern void PGSharedMemoryReAttach(void);
extern void PGSharedMemoryNoReAttach(void);
#endif
-extern PGShmemHeader *PGSharedMemoryCreate(Size size,
+/*
+ * Round each mapping size up to a multiple of a typical page size.
+ */
+static inline void
+round_off_mapping_sizes(MemoryMappingSizes *mapping_sizes)
+{
+	/* Avoid adding a full extra page when the size is already page-aligned. */
+	mapping_sizes->shmem_req_size = add_size(mapping_sizes->shmem_req_size,
+											 (8192 - mapping_sizes->shmem_req_size % 8192) % 8192);
+	mapping_sizes->shmem_reserved = add_size(mapping_sizes->shmem_reserved,
+											 (8192 - mapping_sizes->shmem_reserved % 8192) % 8192);
+}
+
+
+extern PGShmemHeader *PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id,
PGShmemHeader **shim);
extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2);
extern void PGSharedMemoryDetach(void);
-extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags);
+extern const char *MappingName(int shmem_segment);
+extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags,
+ int *memfd_flags);
+extern void PrepareHugePages(void);
+
+extern bool ProcessBarrierShmemResize(Barrier *barrier);
+extern const char *show_shared_buffers(void);
+extern bool check_shared_buffers(int *newval, void **extra, GucSource source);
+extern void AdjustShmemSize(void);
+extern void WaitOnShmemBarrier(void);
+extern void ShmemControlInit(void);
+
+/*
+ * To be able to dynamically resize the largest parts of the data stored in
+ * shared memory, we split it across multiple shared memory segments. Each
+ * segment contains only a certain part of the data, whose size depends on
+ * NBuffers.
+ */
+
+/* The main segment, contains everything except buffer blocks and related data. */
+#define MAIN_SHMEM_SEGMENT 0
+
+/* Buffer blocks */
+#define BUFFERS_SHMEM_SEGMENT 1
+
+/* Buffer descriptors */
+#define BUFFER_DESCRIPTORS_SHMEM_SEGMENT 2
+
+/* Condition variables for buffers */
+#define BUFFER_IOCV_SHMEM_SEGMENT 3
+
+/* Checkpoint BufferIds */
+#define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4
#endif /* PG_SHMEM_H */
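For illustration, the names MappingName() returns for these segment indexes ('main', 'buffers', 'descriptors', 'iocv', and 'checkpoint' in the regression output below) appear in the new segment column of pg_shmem_allocations, so per-segment usage can be summarized with a sketch like:

    SELECT segment, sum(allocated_size) AS allocated
    FROM pg_shmem_allocations
    GROUP BY segment
    ORDER BY segment;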
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 428aa3fd68a0..5ced2a835370 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -42,9 +42,10 @@ typedef enum
PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
PMSIGNAL_XLOG_IS_SHUTDOWN, /* ShutdownXLOG() completed */
+ PMSIGNAL_SHMEM_RESIZE, /* resize shared memory */
} PMSignalReason;
-#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1)
+#define NUM_PMSIGNALS (PMSIGNAL_SHMEM_RESIZE+1)
/*
* Reasons why the postmaster would send SIGQUIT to its children.
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index afeeb1ca019f..4de11faf12d4 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -54,6 +54,10 @@ typedef enum
typedef enum
{
PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */
+ PROCSIGNAL_BARRIER_SHBUF_SHRINK, /* shrink buffer pool - restrict allocations to new size */
+ PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, /* remap shared memory segments and update structure pointers */
+ PROCSIGNAL_BARRIER_SHBUF_EXPAND, /* expand buffer pool - enable allocations in new range */
+ PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, /* signal backends that the shared buffer resizing failed. */
} ProcSignalBarrierType;
/*
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 70a5b8b172c6..d59e5ba6dcd6 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -30,19 +30,33 @@ extern PGDLLIMPORT slock_t *ShmemLock;
typedef struct PGShmemHeader PGShmemHeader; /* avoid including
* storage/pg_shmem.h here */
extern void InitShmemAccess(PGShmemHeader *seghdr);
+extern void InitShmemAccessInSegment(struct PGShmemHeader *seghdr,
+ int shmem_segment);
extern void InitShmemAllocation(void);
+extern void InitShmemAllocationInSegment(int shmem_segment);
extern void *ShmemAlloc(Size size);
+extern void *ShmemAllocInSegment(Size size, int shmem_segment);
extern void *ShmemAllocNoError(Size size);
+extern void *ShmemAllocUnlockedInSegment(Size size, int shmem_segment);
extern bool ShmemAddrIsValid(const void *addr);
+extern bool ShmemAddrIsValidInSegment(const void *addr, int shmem_segment);
extern void InitShmemIndex(void);
extern HTAB *ShmemInitHash(const char *name, int64 init_size, int64 max_size,
HASHCTL *infoP, int hash_flags);
+extern HTAB *ShmemInitHashInSegment(const char *name, int64 init_size,
+									int64 max_size, HASHCTL *infoP,
+									int hash_flags, int shmem_segment);
extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
+extern void *ShmemInitStructInSegment(const char *name, Size size,
+ bool *foundPtr, int shmem_segment);
+extern void *ShmemUpdateStructInSegment(const char *name, Size size,
+ bool *foundPtr, int shmem_segment);
extern Size add_size(Size s1, Size s2);
extern Size mul_size(Size s1, Size s2);
extern PGDLLIMPORT Size pg_get_shmem_pagesize(void);
+
/* ipci.c */
extern void RequestAddinShmemSpace(Size size);
@@ -59,6 +73,7 @@ typedef struct
void *location; /* location in shared mem */
Size size; /* # bytes requested for the structure */
Size allocated_size; /* # bytes actually allocated */
+ int shmem_segment; /* segment in which the structure is allocated */
} ShmemIndexEnt;
#endif /* SHMEM_H */
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index f21ec37da893..08a84373fb70 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -459,6 +459,8 @@ extern config_handle *get_config_handle(const char *name);
extern void AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt);
extern char *GetConfigOptionByName(const char *name, const char **varname,
bool missing_ok);
+extern void convert_int_from_base_unit(int64 base_value, int base_unit,
+ int64 *value, const char **unit);
extern void TransformGUCArray(ArrayType *array, List **names,
List **values);
diff --git a/src/test/Makefile b/src/test/Makefile
index 511a72e6238a..95f8858a8183 100644
--- a/src/test/Makefile
+++ b/src/test/Makefile
@@ -12,7 +12,7 @@ subdir = src/test
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription
+SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription buffermgr
ifeq ($(with_icu),yes)
SUBDIRS += icu
diff --git a/src/test/README b/src/test/README
index afdc76765190..77f11607ff76 100644
--- a/src/test/README
+++ b/src/test/README
@@ -15,6 +15,9 @@ examples/
Demonstration programs for libpq that double as regression tests via
"make check"
+buffermgr/
+  Tests for resizing the buffer pool without restarting the server
+
isolation/
Tests for concurrent behavior at the SQL level
diff --git a/src/test/buffermgr/Makefile b/src/test/buffermgr/Makefile
new file mode 100644
index 000000000000..eb275027fa60
--- /dev/null
+++ b/src/test/buffermgr/Makefile
@@ -0,0 +1,30 @@
+#-------------------------------------------------------------------------
+#
+# Makefile for src/test/buffermgr
+#
+# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+# Portions Copyright (c) 1994, Regents of the University of California
+#
+# src/test/buffermgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+EXTRA_INSTALL = contrib/pg_buffercache
+
+REGRESS = buffer_resize
+
+# Custom configuration for buffer manager tests
+TEMP_CONFIG = $(srcdir)/buffermgr_test.conf
+
+subdir = src/test/buffermgr
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+check:
+ $(prove_check)
+
+installcheck:
+ $(prove_installcheck)
+
+clean distclean:
+ rm -rf tmp_check
diff --git a/src/test/buffermgr/README b/src/test/buffermgr/README
new file mode 100644
index 000000000000..c375ad809892
--- /dev/null
+++ b/src/test/buffermgr/README
@@ -0,0 +1,26 @@
+src/test/buffermgr/README
+
+Regression tests for buffer manager
+===================================
+
+This directory contains a test suite for resizing the buffer pool without
+restarting the server.
+
+
+Running the tests
+=================
+
+NOTE: You must have given the --enable-tap-tests argument to configure.
+
+Run
+ make check
+or
+ make installcheck
+You can use "make installcheck" if you previously did "make install".
+In that case, the code in the installation tree is tested. With
+"make check", a temporary installation tree is built from the current
+sources and then tested.
+
+Either way, this test initializes, starts, and stops a test Postgres
+cluster.
+
+See src/test/perl/README for more info about running these tests.
diff --git a/src/test/buffermgr/buffermgr_test.conf b/src/test/buffermgr/buffermgr_test.conf
new file mode 100644
index 000000000000..b7c0065c80b8
--- /dev/null
+++ b/src/test/buffermgr/buffermgr_test.conf
@@ -0,0 +1,11 @@
+# Configuration for buffer manager regression tests
+
+# Even if max_shared_buffers is set multiple times, only the last occurrence
+# is used as the limit on shared_buffers.
+max_shared_buffers = 128kB
+# Set initial shared_buffers as expected by test
+shared_buffers = 128MB
+# Set a larger value for max_shared_buffers to allow testing resize operations
+max_shared_buffers = 300MB
+# Turn huge pages off, since that affects the size of memory segments
+huge_pages = off
\ No newline at end of file
diff --git a/src/test/buffermgr/expected/buffer_resize.out b/src/test/buffermgr/expected/buffer_resize.out
new file mode 100644
index 000000000000..d5cb9d784372
--- /dev/null
+++ b/src/test/buffermgr/expected/buffer_resize.out
@@ -0,0 +1,329 @@
+-- Test buffer pool resizing and shared memory allocation tracking
+-- This test resizes the buffer pool multiple times and monitors
+-- shared memory allocations related to buffer management
+-- TODO: The test sets shared_buffers values in MBs. Instead it could use values
+-- in kBs so that the test runs on very small machines.
+-- Create a view for buffer-related shared memory allocations
+CREATE VIEW buffer_allocations AS
+SELECT name, segment, size, allocated_size
+FROM pg_shmem_allocations
+WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables',
+ 'Checkpoint BufferIds')
+ORDER BY name;
+-- Note: We exclude the 'main' segment even though it contains the shared buffer
+-- lookup table, because it also contains other shared structures whose total
+-- sizes may vary as the code changes.
+CREATE VIEW buffer_segments AS
+SELECT name, size, mapping_size, mapping_reserved_size
+FROM pg_shmem_segments
+WHERE name <> 'main'
+ORDER BY name;
+-- Enable pg_buffercache for buffer count verification
+CREATE EXTENSION IF NOT EXISTS pg_buffercache;
+-- Test 1: Default shared_buffers
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 128MB
+(1 row)
+
+SHOW max_shared_buffers;
+ max_shared_buffers
+--------------------
+ 300MB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+-----------+----------------
+ Buffer Blocks | buffers | 134221824 | 134221824
+ Buffer Descriptors | descriptors | 1048576 | 1048576
+ Buffer IO Condition Variables | iocv | 262144 | 262144
+ Checkpoint BufferIds | checkpoint | 327680 | 327680
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+-----------+--------------+-----------------------
+ buffers | 134225920 | 134225920 | 314580992
+ checkpoint | 335872 | 335872 | 770048
+ descriptors | 1056768 | 1056768 | 2465792
+ iocv | 270336 | 270336 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 16384
+(1 row)
+
+-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op.
+SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 128MB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+-----------+----------------
+ Buffer Blocks | buffers | 134221824 | 134221824
+ Buffer Descriptors | descriptors | 1048576 | 1048576
+ Buffer IO Condition Variables | iocv | 262144 | 262144
+ Checkpoint BufferIds | checkpoint | 327680 | 327680
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+-----------+--------------+-----------------------
+ buffers | 134225920 | 134225920 | 314580992
+ checkpoint | 335872 | 335872 | 770048
+ descriptors | 1056768 | 1056768 | 2465792
+ iocv | 270336 | 270336 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 16384
+(1 row)
+
+-- Test 2: Set to 64MB
+ALTER SYSTEM SET shared_buffers = '64MB';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+ shared_buffers
+-----------------------
+ 128MB (pending: 64MB)
+(1 row)
+
+SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 64MB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+----------+----------------
+ Buffer Blocks | buffers | 67112960 | 67112960
+ Buffer Descriptors | descriptors | 524288 | 524288
+ Buffer IO Condition Variables | iocv | 131072 | 131072
+ Checkpoint BufferIds | checkpoint | 163840 | 163840
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+----------+--------------+-----------------------
+ buffers | 67117056 | 67117056 | 314580992
+ checkpoint | 172032 | 172032 | 770048
+ descriptors | 532480 | 532480 | 2465792
+ iocv | 139264 | 139264 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 8192
+(1 row)
+
+-- Test 3: Set to 256MB
+ALTER SYSTEM SET shared_buffers = '256MB';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+ shared_buffers
+-----------------------
+ 64MB (pending: 256MB)
+(1 row)
+
+SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 256MB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+-----------+----------------
+ Buffer Blocks | buffers | 268439552 | 268439552
+ Buffer Descriptors | descriptors | 2097152 | 2097152
+ Buffer IO Condition Variables | iocv | 524288 | 524288
+ Checkpoint BufferIds | checkpoint | 655360 | 655360
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+-----------+--------------+-----------------------
+ buffers | 268443648 | 268443648 | 314580992
+ checkpoint | 663552 | 663552 | 770048
+ descriptors | 2105344 | 2105344 | 2465792
+ iocv | 532480 | 532480 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 32768
+(1 row)
+
+-- Test 4: Set to 100MB (non-power-of-two)
+ALTER SYSTEM SET shared_buffers = '100MB';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+ shared_buffers
+------------------------
+ 256MB (pending: 100MB)
+(1 row)
+
+SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 100MB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+-----------+----------------
+ Buffer Blocks | buffers | 104861696 | 104861696
+ Buffer Descriptors | descriptors | 819200 | 819200
+ Buffer IO Condition Variables | iocv | 204800 | 204800
+ Checkpoint BufferIds | checkpoint | 256000 | 256000
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+-----------+--------------+-----------------------
+ buffers | 104865792 | 104865792 | 314580992
+ checkpoint | 262144 | 262144 | 770048
+ descriptors | 827392 | 827392 | 2465792
+ iocv | 212992 | 212992 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 12800
+(1 row)
+
+-- Test 5: Set to minimum 128kB
+ALTER SYSTEM SET shared_buffers = '128kB';
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+ shared_buffers
+------------------------
+ 100MB (pending: 128kB)
+(1 row)
+
+SELECT pg_resize_shared_buffers();
+ pg_resize_shared_buffers
+--------------------------
+ t
+(1 row)
+
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 128kB
+(1 row)
+
+SELECT * FROM buffer_allocations;
+ name | segment | size | allocated_size
+-------------------------------+-------------+--------+----------------
+ Buffer Blocks | buffers | 135168 | 135168
+ Buffer Descriptors | descriptors | 1024 | 1024
+ Buffer IO Condition Variables | iocv | 256 | 256
+ Checkpoint BufferIds | checkpoint | 320 | 320
+(4 rows)
+
+SELECT * FROM buffer_segments;
+ name | size | mapping_size | mapping_reserved_size
+-------------+--------+--------------+-----------------------
+ buffers | 139264 | 139264 | 314580992
+ checkpoint | 8192 | 8192 | 770048
+ descriptors | 8192 | 8192 | 2465792
+ iocv | 8192 | 8192 | 622592
+(4 rows)
+
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+ buffer_count
+--------------
+ 16
+(1 row)
+
+-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail)
+ALTER SYSTEM SET shared_buffers = '400MB';
+ERROR: invalid value for parameter "shared_buffers": 51200
+DETAIL: "shared_buffers" must be less than "max_shared_buffers".
+SELECT pg_reload_conf();
+ pg_reload_conf
+----------------
+ t
+(1 row)
+
+-- reconnect to ensure new setting is loaded
+\c
+-- This should show the old value since the configuration was rejected
+SHOW shared_buffers;
+ shared_buffers
+----------------
+ 128kB
+(1 row)
+
+SHOW max_shared_buffers;
+ max_shared_buffers
+--------------------
+ 300MB
+(1 row)
+
diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build
new file mode 100644
index 000000000000..f33feb64a069
--- /dev/null
+++ b/src/test/buffermgr/meson.build
@@ -0,0 +1,24 @@
+# Copyright (c) 2022-2025, PostgreSQL Global Development Group
+
+tests += {
+ 'name': 'buffermgr',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'buffer_resize',
+ ],
+ 'regress_args': ['--temp-config', files('buffermgr_test.conf')],
+ },
+ 'tap': {
+ 'env': {
+ 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
+ },
+ 'tests': [
+ 't/001_resize_buffer.pl',
+ 't/002_checkpoint_buffer_resize.pl',
+ 't/003_parallel_resize_buffer.pl',
+ 't/004_client_join_buffer_resize.pl',
+ ],
+ },
+}
diff --git a/src/test/buffermgr/sql/buffer_resize.sql b/src/test/buffermgr/sql/buffer_resize.sql
new file mode 100644
index 000000000000..dfaaeabfcbbb
--- /dev/null
+++ b/src/test/buffermgr/sql/buffer_resize.sql
@@ -0,0 +1,95 @@
+-- Test buffer pool resizing and shared memory allocation tracking
+-- This test resizes the buffer pool multiple times and monitors
+-- shared memory allocations related to buffer management
+-- TODO: The test sets shared_buffers values in MBs. Instead it could use values
+-- in kBs so that the test runs on very small machines.
+
+-- Create a view for buffer-related shared memory allocations
+CREATE VIEW buffer_allocations AS
+SELECT name, segment, size, allocated_size
+FROM pg_shmem_allocations
+WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables',
+ 'Checkpoint BufferIds')
+ORDER BY name;
+
+-- Note: We exclude the 'main' segment even though it contains the shared buffer
+-- lookup table, because it also contains other shared structures whose total
+-- sizes may vary as the code changes.
+CREATE VIEW buffer_segments AS
+SELECT name, size, mapping_size, mapping_reserved_size
+FROM pg_shmem_segments
+WHERE name <> 'main'
+ORDER BY name;
+
+-- Enable pg_buffercache for buffer count verification
+CREATE EXTENSION IF NOT EXISTS pg_buffercache;
+
+-- Test 1: Default shared_buffers
+SHOW shared_buffers;
+SHOW max_shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op.
+SELECT pg_resize_shared_buffers();
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 2: Set to 64MB
+ALTER SYSTEM SET shared_buffers = '64MB';
+SELECT pg_reload_conf();
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+SELECT pg_resize_shared_buffers();
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 3: Set to 256MB
+ALTER SYSTEM SET shared_buffers = '256MB';
+SELECT pg_reload_conf();
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+SELECT pg_resize_shared_buffers();
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 4: Set to 100MB (non-power-of-two)
+ALTER SYSTEM SET shared_buffers = '100MB';
+SELECT pg_reload_conf();
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+SELECT pg_resize_shared_buffers();
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 5: Set to minimum 128kB
+ALTER SYSTEM SET shared_buffers = '128kB';
+SELECT pg_reload_conf();
+-- reconnect to ensure new setting is loaded
+\c
+SHOW shared_buffers;
+SELECT pg_resize_shared_buffers();
+SHOW shared_buffers;
+SELECT * FROM buffer_allocations;
+SELECT * FROM buffer_segments;
+SELECT COUNT(*) AS buffer_count FROM pg_buffercache;
+
+-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail)
+ALTER SYSTEM SET shared_buffers = '400MB';
+SELECT pg_reload_conf();
+-- reconnect to ensure new setting is loaded
+\c
+-- This should show the old value since the configuration was rejected
+SHOW shared_buffers;
+SHOW max_shared_buffers;
diff --git a/src/test/buffermgr/t/001_resize_buffer.pl b/src/test/buffermgr/t/001_resize_buffer.pl
new file mode 100644
index 000000000000..a0d7f0941713
--- /dev/null
+++ b/src/test/buffermgr/t/001_resize_buffer.pl
@@ -0,0 +1,135 @@
+# Copyright (c) 2025-2025, PostgreSQL Global Development Group
+#
+# Minimal test exercising shared_buffers resizing under load
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Function to resize buffer pool and verify the change.
+sub apply_and_verify_buffer_change
+{
+ my ($node, $new_size) = @_;
+
+ # Use the new pg_resize_shared_buffers() interface which handles everything synchronously
+ $node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '$new_size'");
+ $node->safe_psql('postgres', "SELECT pg_reload_conf()");
+
+ # If resize function fails, try a few times before giving up
+ my $max_retries = 5;
+ my $retry_delay = 1; # seconds
+ my $success = 0;
+ for my $attempt (1..$max_retries) {
+ my $result = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()");
+ if ($result eq 't') {
+ $success = 1;
+ last;
+ }
+
+ # If not the last attempt, wait before retrying
+ if ($attempt < $max_retries) {
+ note "Resizing buffer pool to $new_size, attempt $attempt failed, retrying after $retry_delay seconds...";
+ sleep($retry_delay);
+ }
+ }
+
+ is($success, 1, 'resizing to ' . $new_size . ' succeeded after retries');
+ is($node->safe_psql('postgres', "SHOW shared_buffers"), $new_size,
+ 'SHOW after resizing to '. $new_size . ' succeeded');
+}
+
+# Initialize a cluster and start pgbench in the background for concurrent load.
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+
+# Permit resizing up to 1GB for this test and let the server start with 128MB.
+$node->append_conf('postgresql.conf', qq{
+max_shared_buffers = 1GB
+shared_buffers = 128MB
+log_statement = none
+});
+
+$node->start;
+$node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache");
+my $pgb_scale = 10;
+my $pgb_duration = 120;
+my $pgb_num_clients = 10;
+$node->pgbench(
+ "--initialize --init-steps=dtpvg --scale=$pgb_scale --quiet",
+ 0,
+ [qr{^$}],
+ [ # stderr patterns to verify initialization stages
+ qr{dropping old tables},
+ qr{creating tables},
+ qr{done in \d+\.\d\d s }
+ ],
+ "pgbench initialization (scale=$pgb_scale)"
+);
+my ($pgbench_stdin, $pgbench_stdout, $pgbench_stderr) = ('', '', '');
+# Use --exit-on-abort so that the test stops on the first server crash or error,
+# thus making it easy to debug the failure. Use -C to increase the chances of a
+# new backend being created while resizing the buffer pool.
+my $pgbench_process = IPC::Run::start(
+ [
+ 'pgbench',
+ '-p', $node->port,
+ '-T', $pgb_duration,
+ '-c', $pgb_num_clients,
+ '-C',
+ '--exit-on-abort',
+ 'postgres'
+ ],
+ '<' => \$pgbench_stdin,
+ '>' => \$pgbench_stdout,
+ '2>' => \$pgbench_stderr
+);
+
+ok($pgbench_process, "pgbench started successfully");
+
+# Allow pgbench to establish connections and start generating load.
+#
+# TODO: When creating new backends is known to work well with buffer pool
+# resizing, this wait should be removed.
+sleep(1);
+
+# Resize buffer pool to various sizes while pgbench is running in the
+# background.
+#
+# TODO: These are pseudo-randomly picked sizes, but we can do better.
+my $tests_completed = 0;
+my @buffer_sizes = ('900MB', '500MB', '250MB', '400MB', '120MB', '600MB');
+for my $target_size (@buffer_sizes)
+{
+ # Verify workload generator is still running
+ if (!$pgbench_process->pumpable) {
+ ok(0, "pgbench is still running");
+ last;
+ }
+
+ apply_and_verify_buffer_change($node, $target_size);
+ $tests_completed++;
+
+ # Wait for the resized buffer pool to stabilize. If the resized buffer pool
+	# Wait for the resized buffer pool to stabilize. If the resized buffer
+	# pool is used to the full, this should expose any wrongly initialized
+	# areas of shared memory.
+}
+is($tests_completed, scalar(@buffer_sizes), "All buffer sizes were tested");
+
+# Make sure that pgbench can end normally.
+$pgbench_process->signal('TERM');
+IPC::Run::finish $pgbench_process;
+ok(grep { $pgbench_process->result == $_ } (0, 15), "pgbench exited gracefully");
+
+# Log any error output from pgbench for debugging
+diag("pgbench stderr:\n$pgbench_stderr");
+diag("pgbench stdout:\n$pgbench_stdout");
+
+# Ensure database is still functional after all the buffer changes
+$node->connect_ok("dbname=postgres",
+ "Database remains accessible after $tests_completed buffer resize operations");
+
+done_testing();
\ No newline at end of file
diff --git a/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl
new file mode 100644
index 000000000000..9ab615b6557f
--- /dev/null
+++ b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl
@@ -0,0 +1,111 @@
+# Copyright (c) 2025-2025, PostgreSQL Global Development Group
+#
+# Test shared_buffers resize coordination with a checkpoint using injection points
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Skip this test if injection points are not supported
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# Initialize cluster with injection points enabled
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points');
+$node->append_conf('postgresql.conf', 'shared_buffers = 256kB');
+# Disable background writer to prevent interference with dirty buffers
+$node->append_conf('postgresql.conf', 'bgwriter_lru_maxpages = 0');
+$node->start;
+
+# Load the injection points extension
+$node->safe_psql('postgres', "CREATE EXTENSION injection_points");
+
+# Create some data to make checkpoint meaningful and ensure many dirty buffers
+$node->safe_psql('postgres', "CREATE TABLE test_data (id int, data text)");
+# Insert enough data to fill well over 16 buffers (each row is ~1kB, so only a few rows fit per 8kB page)
+$node->safe_psql('postgres', "INSERT INTO test_data SELECT i, repeat('x', 1000) FROM generate_series(1, 5000) i");
+
+# Create additional tables to ensure we have plenty of dirty buffers
+$node->safe_psql('postgres', "CREATE TABLE test_data2 AS SELECT * FROM test_data WHERE id <= 2500");
+$node->safe_psql('postgres', "CREATE TABLE test_data3 AS SELECT * FROM test_data WHERE id > 2500");
+
+# Update data to create more dirty buffers
+$node->safe_psql('postgres', "UPDATE test_data SET data = repeat('y', 1000) WHERE id % 3 = 0");
+$node->safe_psql('postgres', "UPDATE test_data2 SET data = repeat('z', 1000) WHERE id % 2 = 0");
+
+# Prepare the new shared_buffers configuration before starting checkpoint
+$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '128kB'");
+$node->safe_psql('postgres', "SELECT pg_reload_conf()");
+
+# Set up the injection point to make checkpoint wait
+$node->safe_psql('postgres', "SELECT injection_points_attach('buffer-sync-dirty-buffer-scan', 'wait')");
+
+# Start a checkpoint in the background that will trigger the injection point
+my $checkpoint_session = $node->background_psql('postgres');
+$checkpoint_session->query_until(
+ qr/starting_checkpoint/,
+ q(
+ \echo starting_checkpoint
+ CHECKPOINT;
+ \q
+ )
+);
+
+# Wait until checkpointer actually reaches the injection point
+$node->wait_for_event('checkpointer', 'buffer-sync-dirty-buffer-scan');
+
+# Verify checkpoint is waiting by checking if it hasn't completed
+my $checkpoint_running = $node->safe_psql('postgres',
+ "SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'checkpointer' AND wait_event = 'buffer-sync-dirty-buffer-scan'");
+is($checkpoint_running, '1', 'Checkpoint is waiting at injection point');
+
+# Start the resize operation in the background (don't wait for completion)
+my $resize_session = $node->background_psql('postgres');
+$resize_session->query_until(
+ qr/starting_resize/,
+ q(
+ \echo starting_resize
+ SELECT pg_resize_shared_buffers();
+ )
+);
+
+# Continue the checkpoint and wait for its completion
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres', "SELECT injection_points_wakeup('buffer-sync-dirty-buffer-scan')");
+
+# Wait for both checkpoint and resize to complete
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+# Wait for the resize operation to complete.  background_psql has no direct
+# way to wait for a previously submitted command, so issue a trivial command
+# and wait for it to return.
+$resize_session->query(q(\echo 'resize_complete'));
+
+pass('Checkpoint and buffer resize both completed after injection point was released');
+
+# Verify the resize actually worked
+is($node->safe_psql('postgres', "SHOW shared_buffers"), '128kB',
+ 'Buffer resize completed successfully after checkpoint coordination');
+
+# Cleanup the background session
+$resize_session->quit;
+
+# Clean up the injection point
+$node->safe_psql('postgres', "SELECT injection_points_detach('buffer-sync-dirty-buffer-scan')");
+
+# Verify system remains stable after coordinated operations
+
+# Perform a normal checkpoint to ensure everything is working
+$node->safe_psql('postgres', "CHECKPOINT");
+
+pass('System remains stable after injection point testing');
+
+# Cleanup
+$node->safe_psql('postgres', "DROP TABLE test_data, test_data2, test_data3");
+
+done_testing();
diff --git a/src/test/buffermgr/t/003_parallel_resize_buffer.pl b/src/test/buffermgr/t/003_parallel_resize_buffer.pl
new file mode 100644
index 000000000000..9cbb5452fd27
--- /dev/null
+++ b/src/test/buffermgr/t/003_parallel_resize_buffer.pl
@@ -0,0 +1,71 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Test that only one pg_resize_shared_buffers() call succeeds when multiple
+# sessions attempt to resize buffers concurrently
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Skip this test if injection points are not supported
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# Initialize a cluster
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points');
+$node->append_conf('postgresql.conf', 'shared_buffers = 128kB');
+$node->append_conf('postgresql.conf', 'max_shared_buffers = 256kB');
+$node->start;
+
+# Load injection points extension for test coordination
+$node->safe_psql('postgres', "CREATE EXTENSION injection_points");
+
+# Test: two concurrent pg_resize_shared_buffers() calls
+# Set up injection point to pause the first resize call
+$node->safe_psql('postgres',
+ "SELECT injection_points_attach('pg-resize-shared-buffers-flag-set', 'wait')");
+
+# Change shared_buffers for the resize operation
+$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '144kB'");
+$node->safe_psql('postgres', "SELECT pg_reload_conf()");
+
+# Start first resize session (will pause at injection point)
+my $session1 = $node->background_psql('postgres');
+$session1->query_until(
+ qr/starting_resize/,
+ q(
+ \echo starting_resize
+ SELECT pg_resize_shared_buffers();
+ )
+);
+
+# Wait until session actually reaches the injection point
+$node->wait_for_event('client backend', 'pg-resize-shared-buffers-flag-set');
+
+# Run a second resize while the first is paused at the injection point
+my $result2 = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()");
+
+# The second call should return false immediately, since a resize is already
+# in progress
+is($result2, 'f', 'Second concurrent resize call returns false');
+
+# Wake up the first session
+$node->safe_psql('postgres',
+ "SELECT injection_points_wakeup('pg-resize-shared-buffers-flag-set')");
+
+# The pg_resize_shared_buffers() in session1 should now complete successfully.
+# We can't easily capture the return value from query_until, so verify that
+# the session completes without error and then that the resize took effect.
+$session1->quit;
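+
+# Confirm the resize took effect (same check as in 002)
+is($node->safe_psql('postgres', "SHOW shared_buffers"), '144kB',
+	'First resize completed after injection point was released');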
+
+# Detach injection point
+$node->safe_psql('postgres',
+ "SELECT injection_points_detach('pg-resize-shared-buffers-flag-set')");
+
+done_testing();
diff --git a/src/test/buffermgr/t/004_client_join_buffer_resize.pl b/src/test/buffermgr/t/004_client_join_buffer_resize.pl
new file mode 100644
index 000000000000..06f0de6b4091
--- /dev/null
+++ b/src/test/buffermgr/t/004_client_join_buffer_resize.pl
@@ -0,0 +1,241 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Test coordination of shared_buffers resizing with newly joining client
+# connections, using injection points
+
+use strict;
+use warnings;
+use IPC::Run;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Time::HiRes qw(sleep);
+
+# Skip this test if injection points are not supported
+if ($ENV{enable_injection_points} ne 'yes')
+{
+ plan skip_all => 'Injection points not supported by this build';
+}
+
+# Convert a size string such as '512kB' into bytes.  Unrecognized units are
+# treated as kB.
+sub size_to_bytes
+{
+	my ($size_string) = @_;
+
+	my ($val, $unit) = ($size_string =~ /(\d+)(\w+)/);
+	my %multiplier = (
+		kb => 1024,
+		mb => 1024 * 1024,
+		gb => 1024 * 1024 * 1024);
+	return $val * ($multiplier{ lc($unit) } // 1024);
+}
+
+# Calculate how large the test table must be to fill the maximum buffer
+# pool when it is populated.
+sub calculate_test_sizes
+{
+	my ($node, $block_size) = @_;
+
+	# Get the maximum buffer pool size from configuration
+	my $max_shared_buffers =
+	  $node->safe_psql('postgres', "SHOW max_shared_buffers");
+	my $max_size_bytes = size_to_bytes($max_shared_buffers);
+
+	# Fill a few more pages than minimally required, to increase the chances
+	# of pages from the test table filling the buffer cache; assume roughly
+	# 100 rows per page for our table structure.
+	my $pages_needed = int($max_size_bytes / $block_size) + 10;
+	my $rows_to_insert = $pages_needed * 100;
+
+	return ($max_size_bytes, $pages_needed, $rows_to_insert);
+}
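+
+# For example, with max_shared_buffers = 512kB and an 8kB block size,
+# calculate_test_sizes() yields int(524288 / 8192) + 10 = 74 pages and
+# 7400 rows.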
+
+# Calculate the expected buffer count for a given size string.
+sub calculate_buffer_count
+{
+	my ($size_string, $block_size) = @_;
+
+	return int(size_to_bytes($size_string) / $block_size);
+}
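+
+# For example, calculate_buffer_count('400kB', 8192) = int(409600 / 8192)
+# = 50 buffers.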
+
+# Initialize cluster with very small buffer sizes for testing
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+
+# Configure buffer resizing with very small buffer pool sizes for faster
+# tests.
+#
+# TODO: for some reason, parallel workers try to allocate the default number
+# of shared buffers, which does not work with a lower max_shared_buffers;
+# somewhere the default value is being picked up.  Until that is fixed,
+# disable parallelism.
+$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points');
+$node->append_conf('postgresql.conf', qq{
+max_shared_buffers = 512kB
+shared_buffers = 320kB
+max_parallel_workers_per_gather = 0
+});
+
+$node->start;
+
+# Enable injection points
+$node->safe_psql('postgres', "CREATE EXTENSION injection_points");
+
+# Get the block size (this is fixed for the binary)
+my $block_size = $node->safe_psql('postgres', "SHOW block_size");
+
+# Try to create pg_buffercache extension for buffer analysis
+eval {
+ $node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache");
+};
+if ($@) {
+ $node->stop;
+ plan skip_all => 'pg_buffercache extension not available - cannot verify buffer usage';
+}
+
+# Create a small test table and record its identifiers for later reference.
+$node->safe_psql('postgres', qq{
+ CREATE TABLE client_test (c1 int, data char(50));
+});
+
+my $table_oid = $node->safe_psql('postgres', "SELECT oid FROM pg_class WHERE relname = 'client_test'");
+my $table_relfilenode = $node->safe_psql('postgres', "SELECT relfilenode FROM pg_class WHERE relname = 'client_test'");
+note("Test table client_test: OID = $table_oid, relfilenode = $table_relfilenode");
+my ($max_size_bytes, $pages_needed, $rows_to_insert) = calculate_test_sizes($node, $block_size);
+
+# Create dedicated sessions for injection point handling and test queries, so
+# that no new backends are created for test operations once a resize has
+# started.  Only one backend, the one that tests synchronization of new
+# backends with an in-progress resize, is started after resizing commences.
+my $injection_session = $node->background_psql('postgres');
+my $query_session = $node->background_psql('postgres');
+my $resize_session = $node->background_psql('postgres');
+
+# Function to run a single injection point test
+sub run_injection_point_test
+{
+ my ($test_name, $injection_point, $target_size, $operation_type) = @_;
+
+ note("Test with $test_name ($operation_type)");
+
+ # Calculate test parameters before starting resize
+	my ($max_size_bytes, $pages_needed, $rows_to_insert) =
+	  calculate_test_sizes($node, $block_size);
+
+ # Update buffer pool size and wait for it to reflect pending state
+ $resize_session->query_safe("ALTER SYSTEM SET shared_buffers = '$target_size'");
+ $resize_session->query_safe("SELECT pg_reload_conf()");
+ my $pending_size_str = "pending: $target_size";
+	$resize_session->poll_query_until(
+		"SELECT substring(current_setting('shared_buffers'), '$pending_size_str')",
+		$pending_size_str);
+
+ # Set up injection point in injection session
+ $injection_session->query_safe("SELECT injection_points_attach('$injection_point', 'wait')");
+
+ # Trigger resize
+ $resize_session->query_until(
+ qr/starting_resize/,
+ q(
+ \echo starting_resize
+ SELECT pg_resize_shared_buffers();
+ )
+ );
+
+ # Wait until resize actually reaches the injection point using the query session
+ $query_session->wait_for_event('client backend', $injection_point);
+
+ # Start a client while resize is paused
+ my $client = $node->background_psql('postgres');
+ note("Background client backend PID: " . $client->query_safe("SELECT pg_backend_pid()"));
+
+ # Wake up the injection point from injection session
+ $injection_session->query_safe("SELECT injection_points_wakeup('$injection_point')");
+
+ # Test buffer functionality immediately after waking up injection point
+ # Insert data to test buffer pool functionality during/after resize
+ $client->query_safe("INSERT INTO client_test SELECT i, 'test_data_' || i FROM generate_series(1, $rows_to_insert) i");
+ # Verify the data was inserted correctly and can be read back
+	is( $client->query_safe("SELECT COUNT(*) FROM client_test"),
+		$rows_to_insert,
+		"inserted $rows_to_insert rows during $test_name ($operation_type)");
+
+	# Verify the table is large enough to overflow the buffer pool
+	ok( $query_session->query_safe(
+			"SELECT pg_total_relation_size('client_test')") >= $max_size_bytes,
+		"table size is large enough to overflow buffer pool in test $test_name ($operation_type)"
+	);
+
+	# Wait for the resize operation to complete.  background_psql has no
+	# direct way to wait for a previously submitted command, so issue a
+	# trivial command and wait for it to return.
+ $resize_session->query(q(\echo 'done'));
+
+ # Detach injection point from injection session
+ $injection_session->query_safe("SELECT injection_points_detach('$injection_point')");
+
+ # Verify resize completed successfully
+ is($query_session->query_safe("SELECT current_setting('shared_buffers')"), $target_size,
+ "resize completed successfully to $target_size");
+
+ # Check buffer pool size using pg_buffercache after resize completion
+	is( $query_session->query_safe("SELECT COUNT(*) FROM pg_buffercache"),
+		calculate_buffer_count($target_size, $block_size),
+		"all buffers in the buffer pool used in $test_name ($operation_type)");
+
+ # Wait for client to complete
+ ok($client->quit, "client succeeded during $test_name ($operation_type)");
+
+ # Clean up for next test
+ $query_session->query_safe("DELETE FROM client_test");
+}
+
+# Test injection points during buffer resize with client connections
+my @common_injection_tests = (
+ {
+ name => 'flag setting phase',
+ injection_point => 'pg-resize-shared-buffers-flag-set',
+ },
+ {
+ name => 'memory remap phase',
+ injection_point => 'pgrsb-after-shmem-resize',
+ },
+ {
+ name => 'resize map barrier complete',
+ injection_point => 'pgrsb-resize-barrier-sent',
+ },
+);
+
+# Test common injection points for both shrinking and expanding
+foreach my $test (@common_injection_tests)
+{
+ # Test shrinking scenario
+ run_injection_point_test($test->{name}, $test->{injection_point}, '272kB', 'shrinking');
+
+ # Test expanding scenario
+ run_injection_point_test($test->{name}, $test->{injection_point}, '400kB', 'expanding');
+}
+
+my @shrink_only_tests = (
+ {
+ name => 'shrink barrier complete',
+ injection_point => 'pgrsb-shrink-barrier-sent',
+ size => '200kB',
+ }
+);
+foreach my $test (@shrink_only_tests)
+{
+ run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'shrinking only');
+}
+
+my @expand_only_tests = (
+ {
+ name => 'expand barrier complete',
+ injection_point => 'pgrsb-expand-barrier-sent',
+ size => '416kB',
+ }
+);
+foreach my $test (@expand_only_tests)
+{
+ run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'expanding only');
+}
+
+$injection_session->quit;
+$query_session->quit;
+$resize_session->quit;
+
+done_testing();
diff --git a/src/test/meson.build b/src/test/meson.build
index ccc31d6a86a1..2a5ba1dec398 100644
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -4,6 +4,7 @@ subdir('regress')
subdir('isolation')
subdir('authentication')
+subdir('buffermgr')
subdir('postmaster')
subdir('recovery')
subdir('subscription')
diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
index 60bbd5dd445b..16625e94d92e 100644
--- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
+++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
@@ -61,6 +61,7 @@ use Config;
use IPC::Run;
use PostgreSQL::Test::Utils qw(pump_until);
use Test::More;
+use Time::HiRes qw(usleep);
=pod
@@ -371,4 +372,79 @@ sub set_query_timer_restart
return $self->{query_timer_restart};
}
+=pod
+
+=item $session->poll_query_until($query [, $expected ])
+
+Run B<$query> repeatedly in this background session, until it returns the
+B<$expected> result ('t', or SQL boolean true, by default).
+Continues polling if the query returns an error result.
+Gives up after roughly C<$PostgreSQL::Test::Utils::timeout_default> seconds.
+Returns 1 if successful, 0 if timed out.
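+
+For example, to wait until a pending shared_buffers change becomes visible:
+
+  $session->poll_query_until(
+      "SELECT current_setting('shared_buffers') = '128kB'")
+    or die "timed out";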
+
+=cut
+
+sub poll_query_until
+{
+ my ($self, $query, $expected) = @_;
+
+ $expected = 't' unless defined($expected); # default value
+
+ my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default;
+ my $attempts = 0;
+ my ($stdout, $stderr_flag);
+
+ while ($attempts < $max_attempts)
+ {
+ ($stdout, $stderr_flag) = $self->query($query);
+
+ chomp($stdout);
+
+ # If query succeeded and returned expected result
+ if (!$stderr_flag && $stdout eq $expected)
+ {
+ return 1;
+ }
+
+ # Wait 0.1 second before retrying.
+ usleep(100_000);
+
+ $attempts++;
+ }
+
+ # Give up. Print the output from the last attempt, hopefully that's useful
+ # for debugging.
+ my $stderr_output = $stderr_flag ? $self->{stderr} : '';
+ diag qq(poll_query_until timed out executing this query:
+$query
+expecting this output:
+$expected
+last actual query output:
+$stdout
+with stderr:
+$stderr_output);
+ return 0;
+}
+
+=pod
+
+=item $session->wait_for_event(backend_type, wait_event_name)
+
+Poll pg_stat_activity until backend_type reaches wait_event_name using this
+background session.
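+
+For example:
+
+  $session->wait_for_event('checkpointer', 'buffer-sync-dirty-buffer-scan');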
+
+=cut
+
+sub wait_for_event
+{
+ my ($self, $backend_type, $wait_event_name) = @_;
+
+ $self->poll_query_until(qq[
+ SELECT count(*) > 0 FROM pg_stat_activity
+ WHERE backend_type = '$backend_type' AND wait_event = '$wait_event_name'
+ ])
+ or die
+ qq(timed out when waiting for $backend_type to reach wait event '$wait_event_name');
+
+ return;
+}
+
1;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 372a2188c22a..f02e82de520d 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1765,14 +1765,22 @@ pg_shadow| SELECT pg_authid.rolname AS usename,
LEFT JOIN pg_db_role_setting s ON (((pg_authid.oid = s.setrole) AND (s.setdatabase = (0)::oid))))
WHERE pg_authid.rolcanlogin;
pg_shmem_allocations| SELECT name,
+ segment,
off,
size,
allocated_size
- FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size);
+ FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, segment, off, size, allocated_size);
pg_shmem_allocations_numa| SELECT name,
numa_node,
size
FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size);
+pg_shmem_segments| SELECT id,
+ name,
+ size,
+ freeoffset,
+ mapping_size,
+ mapping_reserved_size
+ FROM pg_get_shmem_segments() pg_get_shmem_segments(id, name, size, freeoffset, mapping_size, mapping_reserved_size);
pg_stat_activity| SELECT s.datid,
d.datname,
s.pid,
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 27a4d1318978..4d9879ac60d7 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2778,6 +2778,7 @@ ShellTypeInfo
ShippableCacheEntry
ShippableCacheKey
+ShmemControl
ShmemIndexEnt
ShutdownForeignScan_function
ShutdownInformation
ShutdownMode