From 81da8b3cc36601ef44209c9c23fd8d4de1f33d1e Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Mon, 25 Aug 2025 19:23:50 +0530 Subject: [PATCH 1/4] Add a view to read contents of shared buffer lookup table The view exposes the contents of the shared buffer lookup table for debugging, testing and investigation. This helped me in debugging issues where the buffer descriptor array and buffer lookup table were out of sync; either the buffer lookup table had a mapping page->buffer which wasn't present in the buffer descriptor array or a page in the buffer descriptor array didn't have corresponding entry in the buffer lookup table. pg_buffercache doesn't help with those kind of issues. Also doing that under the debugger in very painful. I intend to keep this patch while the rest of the code matures. If it is found useful as a debugging tool, we may consider make it committable and commit it. Author: Ashutosh Bapat --- .../expected/pg_buffercache.out | 39 ++++++++ .../pg_buffercache--1.5--1.6.sql | 24 +++++ contrib/pg_buffercache/pg_buffercache_pages.c | 18 ++++ contrib/pg_buffercache/sql/pg_buffercache.sql | 20 +++++ doc/src/sgml/system-views.sgml | 89 +++++++++++++++++++ src/backend/storage/buffer/buf_table.c | 58 ++++++++++++ src/include/storage/buf_internals.h | 2 + 7 files changed, 250 insertions(+) diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1b..2f27bf34637e 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -23,6 +23,26 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; t (1 row) +-- Test the buffer lookup table function and count is <= shared_buffers +select count(*) <= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_lookup_table_entries(); + ?column? 
+---------- + t +(1 row) + +-- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers +select count(*) <= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_lookup_table; + ?column? +---------- + t +(1 row) + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. SET ROLE pg_database_owner; @@ -34,6 +54,10 @@ SELECT * FROM pg_buffercache_summary(); ERROR: permission denied for function pg_buffercache_summary SELECT * FROM pg_buffercache_usage_counts(); ERROR: permission denied for function pg_buffercache_usage_counts +SELECT * FROM pg_buffercache_lookup_table_entries(); +ERROR: permission denied for function pg_buffercache_lookup_table_entries +SELECT * FROM pg_buffercache_lookup_table; +ERROR: permission denied for view pg_buffercache_lookup_table RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; @@ -55,6 +79,21 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); t (1 row) +RESET role; +-- Check that pg_read_all_stats is allowed to query buffer lookup table +SET ROLE pg_read_all_stats; +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries(); + ?column? +---------- + t +(1 row) + +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table; + ?column? 
+---------- + t +(1 row) + RESET role; ------ ---- Test pg_buffercache_evict* functions diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql index 458f054a6917..9bf58567878d 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -44,3 +44,27 @@ CREATE FUNCTION pg_buffercache_evict_all( OUT buffers_skipped int4) AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all' LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Add the buffer lookup table function +CREATE FUNCTION pg_buffercache_lookup_table_entries( + OUT tablespace oid, + OUT database oid, + OUT relfilenode oid, + OUT forknum int2, + OUT blocknum int8, + OUT bufferid int4) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_buffercache_lookup_table_entries' +LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_lookup_table AS + SELECT * FROM pg_buffercache_lookup_table_entries(); + +-- Don't want these to be available to public. +REVOKE ALL ON FUNCTION pg_buffercache_lookup_table_entries() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_lookup_table FROM PUBLIC; + +-- Grant access to monitoring role. 
+GRANT EXECUTE ON FUNCTION pg_buffercache_lookup_table_entries() TO pg_read_all_stats; +GRANT SELECT ON pg_buffercache_lookup_table TO pg_read_all_stats; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index c29b784dfa1a..31c4a339d74a 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -16,6 +16,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/rel.h" +#include "utils/tuplestore.h" #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 @@ -100,6 +101,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); +PG_FUNCTION_INFO_V1(pg_buffercache_lookup_table_entries); /* Only need to touch memory once per backend process lifetime */ @@ -776,3 +778,19 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Return lookup table content as a set of records. 
+ */ +Datum +pg_buffercache_lookup_table_entries(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + + /* Fill the tuplestore */ + BufTableGetContents(rsinfo->setResult, rsinfo->setDesc); + + return (Datum) 0; +} diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c74..569b28aebb9d 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -12,6 +12,18 @@ from pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; +-- Test the buffer lookup table function and count is <= shared_buffers +select count(*) <= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_lookup_table_entries(); + +-- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers +select count(*) <= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_lookup_table; + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. 
SET ROLE pg_database_owner; @@ -19,6 +31,8 @@ SELECT * FROM pg_buffercache; SELECT * FROM pg_buffercache_pages() AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); +SELECT * FROM pg_buffercache_lookup_table_entries(); +SELECT * FROM pg_buffercache_lookup_table; RESET role; -- Check that pg_monitor is allowed to query view / function @@ -28,6 +42,12 @@ SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); RESET role; +-- Check that pg_read_all_stats is allowed to query buffer lookup table +SET ROLE pg_read_all_stats; +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries(); +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table; +RESET role; + ------ ---- Test pg_buffercache_evict* functions diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 0e623e7fb867..3c751de4a4be 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -71,6 +71,11 @@ backend memory contexts + + pg_buffer_lookup_table + shared buffer lookup table + + pg_config compile-time configuration parameters @@ -901,6 +906,90 @@ AND c1.path[c2.level] = c2.path[c2.level]; + + <structname>pg_buffer_lookup_table</structname> + + pg_buffer_lookup_table + + + The pg_buffer_lookup_table view exposes the current + contents of the shared buffer lookup table. Each row represents an entry in + the lookup table mapping a relation page to the ID of buffer in which it is + cached. The shared buffer lookup table is locked for a short duration while + reading so as to ensure consistency. This may affect performance if this view + is queried very frequently. 
+ + + <structname>pg_buffer_lookup_table</structname> View + + + + + Column Type + + + Description + + + + + + + tablespace oid + + + OID of the tablespace containing the relation + + + + + database oid + + + OID of the database containing the relation (zero for shared relations) + + + + + relfilenode oid + + + relfilenode identifying the relation + + + + + forknum int2 + + + Fork number within the relation (see ) + + + + + blocknum int8 + + + Block number within the relation + + + + + bufferid int4 + + + ID of the buffer caching the page + + + + +
+ + Access to this view is restricted to members of the + pg_read_all_stats role by default. + +
+ <structname>pg_config</structname> diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 9d256559bab9..f0c39ec28222 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -21,7 +21,12 @@ */ #include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" #include "storage/buf_internals.h" +#include "storage/lwlock.h" +#include "utils/rel.h" +#include "utils/builtins.h" /* entry for buffer lookup hashtable */ typedef struct @@ -159,3 +164,56 @@ BufTableDelete(BufferTag *tagPtr, uint32 hashcode) if (!result) /* shouldn't happen */ elog(ERROR, "shared buffer hash table corrupted"); } + +/* + * BufTableGetContents + * Fill the given tuplestore with contents of the shared buffer lookup table + * + * This function is used by pg_buffercache extension to expose buffer lookup + * table contents via SQL. The caller is responsible for setting up the + * tuplestore and result set info. + */ +void +BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc) +{ +/* Expected number of attributes of the buffer lookup table entry. */ +#define BUFTABLE_CONTENTS_COLS 6 + + HASH_SEQ_STATUS hstat; + BufferLookupEnt *ent; + Datum values[BUFTABLE_CONTENTS_COLS]; + bool nulls[BUFTABLE_CONTENTS_COLS]; + int i; + + memset(nulls, 0, sizeof(nulls)); + + Assert(tupdesc->natts == BUFTABLE_CONTENTS_COLS); + + /* + * Lock all buffer mapping partitions to ensure a consistent view of the + * hash table during the scan. Must grab LWLocks in partition-number order + * to avoid LWLock deadlock. 
+ */ + for (i = 0; i < NUM_BUFFER_PARTITIONS; i++) + LWLockAcquire(BufMappingPartitionLockByIndex(i), LW_SHARED); + + hash_seq_init(&hstat, SharedBufHash); + while ((ent = (BufferLookupEnt *) hash_seq_search(&hstat)) != NULL) + { + values[0] = ObjectIdGetDatum(ent->key.spcOid); + values[1] = ObjectIdGetDatum(ent->key.dbOid); + values[2] = ObjectIdGetDatum(ent->key.relNumber); + values[3] = ObjectIdGetDatum(ent->key.forkNum); + values[4] = Int64GetDatum(ent->key.blockNum); + values[5] = Int32GetDatum(ent->id); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* + * Release all buffer mapping partition locks in the reverse order so as + * to avoid LWLock deadlock. + */ + for (i = NUM_BUFFER_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(BufMappingPartitionLockByIndex(i)); +} diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 5400c56a965f..519692702a02 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -28,6 +28,7 @@ #include "storage/spin.h" #include "utils/relcache.h" #include "utils/resowner.h" +#include "utils/tuplestore.h" /* * Buffer state is a single 32-bit variable where following data is combined. @@ -520,6 +521,7 @@ extern uint32 BufTableHashCode(BufferTag *tagPtr); extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); +extern void BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc); /* localbuf.c */ extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount); From a12248f73045996ba66347c4a9b22fff7895764a Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Fri, 28 Feb 2025 19:54:47 +0100 Subject: [PATCH 2/4] Memory and address space management for buffer resizing This has three changes 1. 
Allow to use multiple shared memory mappings
============================================

Currently all the work with shared memory is done via a single
anonymous memory mapping, which limits the ways in which the shared
memory could be organized.

Introduce the possibility to allocate multiple shared memory mappings,
where a single mapping is associated with a specified shared memory
segment. A new shared memory API is introduced, extended with a segment
as a new parameter. As a path of least resistance, the original API is
kept in place, utilizing the main shared memory segment.

Modifies pg_shmem_allocations to report the shared memory segment as
well. Adds pg_shmem_segments to report shared memory segment
information.

2. Address space reservation for shared memory
============================================

Currently the shared memory layout is designed to pack everything tight
together, leaving no space between mappings for resizing. Here is how
it looks for one mapping in /proc/$PID/maps, /dev/zero represents the
anonymous shared memory we talk about:

00400000-00490000         /path/bin/postgres
...
012d9000-0133e000         [heap]
7f443a800000-7f470a800000 /dev/zero (deleted)
7f470a800000-7f471831d000 /usr/lib/locale/locale-archive
7f4718400000-7f4718401000 /usr/lib64/libstdc++.so.6.0.34
...

Make the layout more dynamic via splitting every shared memory segment
into two parts:

* An anonymous file, which actually contains shared memory content.
  Such an anonymous file is created via memfd_create, it lives in
  memory, behaves like a regular file and is semantically equivalent to
  anonymous memory allocated via mmap with MAP_ANONYMOUS.

* A reservation mapping, whose size is much larger than the required
  shared segment size. This mapping is created with the flag
  MAP_NORESERVE (to not count the reserved space against memory
  limits). The anonymous file is mapped into this reservation mapping.
If we have to change the address maps while resizing the shared buffer
pool, it needs to be done in Postmaster too, so that the new backends
will inherit the resized address space from the Postmaster. However,
Postmaster is not involved in the ProcSignalBarrier mechanism and we
don't want it to spend time in things other than its core
functionality. To achieve that, the maximum required address space maps
are set up upfront with read and write access when starting the server.
When resizing the buffer pool only the backing file object is resized
from the coordinator. This also makes the ProcSignalBarrier handling
code light for backends other than the coordinator.

The resulting layout looks like this:

00400000-00490000              /path/bin/postgres
...
3f526000-3f590000 rw-p         [heap]
7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted) -- anon file
7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted) -- reservation
7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive
7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34

To resize a shared memory segment in this layout it's possible to use
ftruncate on the memory mapped file. This approach also does not impact
the actual memory usage as reported by the kernel.

TODO: Verify that Cgroup v2 doesn't have any problems with that as
well.
To verify, a new cgroup was created with the memory limit 256 MB, then
PostgreSQL was launched within this cgroup with shared_buffers =
128 MB:

$ cd /sys/fs/cgroup
$ mkdir postgres
$ cd postgres
$ echo 268435456 > memory.max
$ echo $MASTER_PID_SHELL > cgroup.procs
# postgres from the master branch has been successfully launched
# from that shell
$ cat memory.current
17465344 (~16.6 MB)
# stop postgres
$ echo $PATCH_PID_SHELL > cgroup.procs
# postgres from the patch has been successfully launched from that shell
$ cat memory.current
20770816 (~19.8 MB)

There are also a few unrelated advantages of using memory mapped files:

* We've got a file descriptor, which could be used for regular file
  operations (modification, truncation, you name it).

* The file could be given a name, which improves readability when it
  comes to process maps.

* By default, Linux will not add file-backed shared mappings into a
  core dump, making it more convenient to work with them in PostgreSQL:
  no more huge dumps to process. - Some hackers have expressed concerns
  over it.

The downside is that memfd_create is Linux specific.

3. Refactor CalculateShmemSize()
=============================

This function calls many functions which return the amount of shared
memory required for different shared memory data structures. Up until
now, the returned total of these sizes was used to create a single
shared memory segment.

With this change, CalculateShmemSize() needs to estimate memory
requirements for each of the segments. It now takes an array of
MemoryMappingSizes, containing as many elements as the number of
segments, as an argument. The sizes returned by all the functions it
calls, except BufferManagerShmemSize(), are added and saved in the
first element (index 0) of the array. BufferManagerShmemSize() is
modified to save the amount of memory required for buffer manager
related segments in the corresponding array element. Additionally it
also saves the amount of reserved space.
For now, the amount of reserved address space is same as the amount of required memory but that is expected to change with the next commit which implements buffer pool resize. CalculateShmemSize() now returns the total of sizes corresponding to all the sizes. Author: Dmitrii Dolgov and Ashutosh Bapat Reviewed-by: Tomas Vondra --- doc/src/sgml/system-views.sgml | 9 + src/backend/catalog/system_views.sql | 7 + src/backend/port/sysv_shmem.c | 425 +++++++++++++++++++------ src/backend/port/win32_sema.c | 2 +- src/backend/port/win32_shmem.c | 14 +- src/backend/storage/buffer/buf_init.c | 56 ++-- src/backend/storage/buffer/buf_table.c | 6 +- src/backend/storage/buffer/freelist.c | 5 +- src/backend/storage/ipc/ipc.c | 4 +- src/backend/storage/ipc/ipci.c | 99 ++++-- src/backend/storage/ipc/shmem.c | 243 ++++++++++---- src/backend/storage/lmgr/lwlock.c | 15 +- src/include/catalog/pg_proc.dat | 12 +- src/include/portability/mem.h | 2 +- src/include/storage/bufmgr.h | 3 +- src/include/storage/ipc.h | 4 +- src/include/storage/pg_shmem.h | 60 +++- src/include/storage/shmem.h | 12 + src/test/regress/expected/rules.out | 10 +- 19 files changed, 755 insertions(+), 233 deletions(-) diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 3c751de4a4be..7cb1e5e17f81 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -4233,6 +4233,15 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+ + + segment text + + + The name of the shared memory segment concerning the allocation. + + + off int8 diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 95ad29a64b98..6a0180f39be4 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -668,6 +668,13 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_segments AS + SELECT * FROM pg_get_shmem_segments(); + +REVOKE ALL ON pg_shmem_segments FROM PUBLIC; +GRANT SELECT ON pg_shmem_segments TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_segments() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_segments() TO pg_read_all_stats; CREATE VIEW pg_shmem_allocations_numa AS SELECT * FROM pg_get_shmem_allocations_numa(); diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..cc4b2c80e1ab 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -90,12 +90,49 @@ typedef enum SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */ } IpcMemoryState; - +/* + * TODO: These should be moved into ShmemSegment, now that there can be multiple + * shared memory segments. But there's windows specific code which will need + * adjustment, so leaving it here. + */ unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; -static Size AnonymousShmemSize; -static void *AnonymousShmem = NULL; +/* + * Anonymous mapping layout we use looks like this: + * + * 00400000-00c2a000 r-xp /bin/postgres + * ... + * 3f526000-3f590000 rw-p [heap] + * 7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted) + * 7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted) + * 7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive + * 7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34 + * ... 
+ *
+ * We need to place shared memory mappings in such a way that there will be
+ * gaps between them in the address space. Those gaps have to be large enough
+ * to resize the mapping up to a certain size, without counting towards the
+ * total memory consumption.
+ *
+ * To achieve this, for each shared memory segment we first create an anonymous
+ * file of the specified size using memfd_create, which will accommodate the
+ * actual shared memory mapping content. It is represented by the first
+ * /memfd:main with rw permissions. Then we create a mapping for this file
+ * using mmap, with size much larger than required and flags PROT_NONE (makes
+ * sure the reserved space will not be used) and MAP_NORESERVE (prevents the
+ * space from being counted against memory limits). The mapping serves as an
+ * address space reservation, into which the shared memory segment can be
+ * extended and is represented by the second /memfd:main with no permissions.
+ */
+
+/*
+ * Flag telling that we have decided to use huge pages.
+ *
+ * XXX: It's possible to use GetConfigOption("huge_pages_status", false, false)
+ * instead, but it feels like overkill.
+ */ +static bool huge_pages_on = false; static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); @@ -104,6 +141,27 @@ static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, void *attachAt, PGShmemHeader **addr); +const char* +MappingName(int shmem_segment) +{ + switch (shmem_segment) + { + case MAIN_SHMEM_SEGMENT: + return "main"; + case BUFFERS_SHMEM_SEGMENT: + return "buffers"; + case BUFFER_DESCRIPTORS_SHMEM_SEGMENT: + return "descriptors"; + case BUFFER_IOCV_SHMEM_SEGMENT: + return "iocv"; + case CHECKPOINT_BUFFERS_SHMEM_SEGMENT: + return "checkpoint"; + case STRATEGY_SHMEM_SEGMENT: + return "strategy"; + default: + return "unknown"; + } +} /* * InternalIpcMemoryCreate(memKey, size) @@ -470,19 +528,20 @@ PGSharedMemoryAttach(IpcMemoryId shmId, * hugepage sizes, we might want to think about more invasive strategies, * such as increasing shared_buffers to absorb the extra space. * - * Returns the (real, assumed or config provided) page size into - * *hugepagesize, and the hugepage-related mmap flags to use into - * *mmap_flags if requested by the caller. If huge pages are not supported, - * *hugepagesize and *mmap_flags are set to 0. + * Returns the (real, assumed or config provided) page size into *hugepagesize, + * the hugepage-related mmap and memfd flags to use into *mmap_flags and + * *memfd_flags if requested by the caller. If huge pages are not supported, + * *hugepagesize, *mmap_flags and *memfd_flags are set to 0. */ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { #ifdef MAP_HUGETLB Size default_hugepagesize = 0; Size hugepagesize_local = 0; int mmap_flags_local = 0; + int memfd_flags_local = 0; /* * System-dependent code to find out the default huge page size. 
@@ -541,6 +600,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } mmap_flags_local = MAP_HUGETLB; + memfd_flags_local = MFD_HUGETLB; /* * On recent enough Linux, also include the explicit page size, if @@ -551,7 +611,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { int shift = pg_ceil_log2_64(hugepagesize_local); - mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif + +#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT) + if (hugepagesize_local != default_hugepagesize) + { + int shift = pg_ceil_log2_64(hugepagesize_local); + + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; } #endif @@ -560,6 +629,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *mmap_flags = mmap_flags_local; if (hugepagesize) *hugepagesize = hugepagesize_local; + if (memfd_flags) + *memfd_flags = memfd_flags_local; #else @@ -567,6 +638,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *hugepagesize = 0; if (mmap_flags) *mmap_flags = 0; + if (memfd_flags) + *memfd_flags = 0; #endif /* MAP_HUGETLB */ } @@ -588,83 +661,242 @@ check_huge_page_size(int *newval, void **extra, GucSource source) return true; } +/* + * Wrapper around posix_fallocate() to allocate memory for a given shared memory + * segment. + * + * Performs retry on EINTR, and raises error upon failure. + */ +static void +shmem_fallocate(int fd, const char *mapping_name, Size size, int elevel) +{ +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + int ret; + + + /* + * If there is not enough memory, trying to access a hole in address space + * will cause SIGBUS. If supported, avoid that by allocating memory upfront. + * + * We still use a traditional EINTR retry loop to handle SIGCONT. + * posix_fallocate() doesn't restart automatically, and we don't want this to + * fail if you attach a debugger. 
+ */ + do + { + ret = posix_fallocate(fd, 0, size); + } while (ret == EINTR); + + if (ret != 0) + { + ereport(elevel, + (errmsg("segment[%s]: could not allocate space for anonymous file: %s", + mapping_name, strerror(ret)), + (ret == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request " + "for a shared memory segment exceeded available memory, " + "swap space, or huge pages. To reduce the request size " + "(currently %zu bytes), reduce PostgreSQL's shared " + "memory usage, perhaps by reducing \"shared_buffers\" or " + "\"max_connections\".", + size) : 0)); + } +#endif /* HAVE_POSIX_FALLOCATE && __linux__ */ +} + +/* + * Round up the required amount of memory and the amount of required reserved + * address space to the nearest huge page size. + */ +static inline void +round_off_mapping_sizes_for_hugepages(MemoryMappingSizes *mapping, int hugepagesize) +{ + if (hugepagesize == 0) + return; + + if (mapping->shmem_req_size % hugepagesize != 0) + mapping->shmem_req_size += hugepagesize - + (mapping->shmem_req_size % hugepagesize); + + if (mapping->shmem_reserved % hugepagesize != 0) + mapping->shmem_reserved = mapping->shmem_reserved + hugepagesize - + (mapping->shmem_reserved % hugepagesize); +} + /* * Creates an anonymous mmap()ed shared memory segment. * - * Pass the requested size in *size. This function will modify *size to the - * actual size of the allocation, if it ends up allocating a segment that is - * larger than requested. + * This function will modify mapping size to the actual size of the allocation, + * if it ends up allocating a segment that is larger than requested. If needed, + * it also rounds up the mapping reserved size to be a multiple of huge page + * size. + * + * Note that we do not fallback from huge pages to regular pages in this + * function, this decision was already made in ReserveAnonymousMemory and we + * stick to it. + * + * TODO: Update the prologue to be consistent with the code. 
*/ -static void * -CreateAnonymousSegment(Size *size) +static void +CreateAnonymousSegment(MemoryMappingSizes *mapping, int segment_id) { - Size allocsize = *size; void *ptr = MAP_FAILED; - int mmap_errno = 0; + int save_errno = 0; + int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0; + ShmemSegment *segment = &Segments[segment_id]; #ifndef MAP_HUGETLB - /* PGSharedMemoryCreate should have dealt with this case */ - Assert(huge_pages != HUGE_PAGES_ON); + /* PrepareHugePages should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); #else - if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + if (huge_pages_on) { - /* - * Round up the request size to a suitable large value. - */ Size hugepagesize; - int mmap_flags; - GetHugePageSize(&hugepagesize, &mmap_flags); + /* Make sure nothing is messed up */ + Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY); - if (allocsize % hugepagesize != 0) - allocsize += hugepagesize - (allocsize % hugepagesize); + /* Round up the request size to a suitable large value */ + GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags); + round_off_mapping_sizes_for_hugepages(mapping, hugepagesize); - ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | mmap_flags, -1, 0); - mmap_errno = errno; - if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) - elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", - allocsize); + /* Verify that the new size is withing the reserved boundaries */ + Assert(mapping->shmem_reserved >= mapping->shmem_req_size); + + mmap_flags = PG_MMAP_FLAGS | mmap_flags; } #endif /* - * Report whether huge pages are in use. This needs to be tracked before - * the second mmap() call if attempting to use huge pages failed - * previously. + * Prepare an anonymous file backing the segment. Its size will be + * specified later via ftruncate. + * + * The file behaves like a regular file, but lives in memory. 
Once all + * references to the file are dropped, it is automatically released. + * Anonymous memory is used for all backing pages of the file, thus it has + * the same semantics as anonymous memory allocations using mmap with the + * MAP_ANONYMOUS flag. */ - SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + segment->segment_fd = memfd_create(MappingName(segment_id), memfd_flags); + if (segment->segment_fd == -1) + ereport(FATAL, + (errmsg("segment[%s]: could not create anonymous shared memory file: %m", + MappingName(segment_id)))); - if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) - { - /* - * Use the original size, not the rounded-up value, when falling back - * to non-huge pages. - */ - allocsize = *size; - ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); - mmap_errno = errno; - } + elog(DEBUG1, "segment[%s]: mmap(%zu)", MappingName(segment_id), mapping->shmem_req_size); + /* + * Reserve maximum required address space for future expansion of this + * memory segment. MAP_NORESERVE ensures that no memory is allocated. The + * whole address space will be setup for read/write access, so that memory + * allocated to this address space can be read or written to even if it is + * resized. + */ + ptr = mmap(NULL, mapping->shmem_reserved, PROT_READ | PROT_WRITE, + mmap_flags | MAP_NORESERVE, segment->segment_fd, 0); if (ptr == MAP_FAILED) + ereport(FATAL, + (errmsg("segment[%s]: could not map anonymous shared memory: %m", + MappingName(segment_id)))); + + /* + * Resize the backing file to the required size. On platforms where it is + * supported, we also allocate the required memory upfront. On other + * platform the memory upto the size of file will be allocated on demand. 
+ */ + if(ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1) { - errno = mmap_errno; + save_errno = errno; + + close(segment->segment_fd); + + errno = save_errno; ereport(FATAL, - (errmsg("could not map anonymous shared memory: %m"), - (mmap_errno == ENOMEM) ? + (errmsg("segment[%s]: could not truncate anonymous file to size %zu: %m", + MappingName(segment_id), mapping->shmem_req_size), + (save_errno == ENOMEM) ? errhint("This error usually means that PostgreSQL's request " "for a shared memory segment exceeded available memory, " "swap space, or huge pages. To reduce the request size " "(currently %zu bytes), reduce PostgreSQL's shared " "memory usage, perhaps by reducing \"shared_buffers\" or " "\"max_connections\".", - allocsize) : 0)); + mapping->shmem_req_size) : 0)); } + shmem_fallocate(segment->segment_fd, MappingName(segment_id), mapping->shmem_req_size, FATAL); - *size = allocsize; - return ptr; + segment->shmem = ptr; + segment->shmem_size = mapping->shmem_req_size; + segment->shmem_reserved = mapping->shmem_reserved; +} + +/* + * PrepareHugePages + * + * Figure out if there are enough huge pages to allocate all shared memory + * segments, and report that information via huge_pages_status and + * huge_pages_on. It needs to be called before creating shared memory segments. + * + * It is necessary to maintain the same semantic (simple on/off) for + * huge_pages_status, even if there are multiple shared memory segments: all + * segments either use huge pages or not, there is no mix of segments with + * different page size. The latter might be actually beneficial, in particular + * because only some segments may require large amount of memory, but for now + * we go with a simple solution. 
+ */
+void
+PrepareHugePages(void)
+{
+	void	   *ptr = MAP_FAILED;
+	MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
+
+	CalculateShmemSize(mapping_sizes);
+
+	/* Complain if hugepages demanded but we can't possibly support them */
+#if !defined(MAP_HUGETLB)
+	if (huge_pages == HUGE_PAGES_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge pages not supported on this platform")));
+#else
+	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
+	{
+		Size		hugepagesize, total_size = 0;
+		int			mmap_flags;
+
+		GetHugePageSize(&hugepagesize, &mmap_flags, NULL);
+
+		/*
+		 * Figure out how much memory is needed for all segments, keeping in
+		 * mind that for every segment this value will be rounded up to the
+		 * huge page size. The resulting value will be used to probe memory and
+		 * decide whether we will allocate huge pages or not.
+		 */
+		for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+		{
+			Size		segment_size = mapping_sizes[segment].shmem_req_size;
+
+			if (segment_size % hugepagesize != 0)
+				segment_size += hugepagesize - (segment_size % hugepagesize);
+
+			total_size += segment_size;
+		}
+
+		/* Map total amount of memory to test its availability. XXX: on success this probe mapping is never munmap'd -- should it be released before the real segments are created? */
+		elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB",
+			 total_size);
+		ptr = mmap(NULL, total_size, PROT_NONE,
+				   PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0);
+	}
+#endif
+
+	/*
+	 * Report whether huge pages are in use. This needs to be tracked before
+	 * creating shared memory segments.
+	 */
+	SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on",
+					PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+	huge_pages_on = ptr != MAP_FAILED;
 }
 
 /*
@@ -674,20 +906,25 @@ CreateAnonymousSegment(Size *size)
 static void
 AnonymousShmemDetach(int status, Datum arg)
 {
-	/* Release anonymous shared memory block, if any.
*/ - if (AnonymousShmem != NULL) + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) { - if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) - elog(LOG, "munmap(%p, %zu) failed: %m", - AnonymousShmem, AnonymousShmemSize); - AnonymousShmem = NULL; + ShmemSegment *segment = &Segments[i]; + + /* Release anonymous shared memory block, if any. */ + if (segment->shmem != NULL) + { + if (munmap(segment->shmem, segment->shmem_size) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + segment->shmem, segment->shmem_size); + segment->shmem = NULL; + } } } /* * PGSharedMemoryCreate * - * Create a shared memory segment of the given size and initialize its + * Create a shared memory segment for the given mapping and initialize its * standard header. Also, register an on_shmem_exit callback to release * the storage. * @@ -697,7 +934,7 @@ AnonymousShmemDetach(int status, Datum arg) * postmaster or backend. */ PGShmemHeader * -PGSharedMemoryCreate(Size size, +PGSharedMemoryCreate(MemoryMappingSizes *mapping, int segment_id, PGShmemHeader **shim) { IpcMemoryKey NextShmemSegID; @@ -705,6 +942,7 @@ PGSharedMemoryCreate(Size size, PGShmemHeader *hdr; struct stat statbuf; Size sysvsize; + ShmemSegment *segment = &Segments[segment_id]; /* * We use the data directory's ID info (inode and device numbers) to @@ -717,14 +955,6 @@ PGSharedMemoryCreate(Size size, errmsg("could not stat data directory \"%s\": %m", DataDir))); - /* Complain if hugepages demanded but we can't possibly support them */ -#if !defined(MAP_HUGETLB) - if (huge_pages == HUGE_PAGES_ON) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("huge pages not supported on this platform"))); -#endif - /* For now, we don't support huge pages in SysV memory */ if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) ereport(ERROR, @@ -732,12 +962,12 @@ PGSharedMemoryCreate(Size size, errmsg("huge pages not supported with the current \"shared_memory_type\" setting"))); /* Room for a header? 
*/ - Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + Assert(mapping->shmem_req_size > MAXALIGN(sizeof(PGShmemHeader))); if (shared_memory_type == SHMEM_TYPE_MMAP) { - AnonymousShmem = CreateAnonymousSegment(&size); - AnonymousShmemSize = size; + /* On success, mapping data will be modified. */ + CreateAnonymousSegment(mapping, segment_id); /* Register on-exit routine to unmap the anonymous segment */ on_shmem_exit(AnonymousShmemDetach, (Datum) 0); @@ -747,7 +977,7 @@ PGSharedMemoryCreate(Size size, } else { - sysvsize = size; + sysvsize = mapping->shmem_req_size; /* huge pages are only available with mmap */ SetConfigOption("huge_pages_status", "off", @@ -760,7 +990,7 @@ PGSharedMemoryCreate(Size size, * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure * that, but prefer fixing it over coping here.) */ - NextShmemSegID = statbuf.st_ino; + NextShmemSegID = statbuf.st_ino + segment_id; for (;;) { @@ -852,13 +1082,13 @@ PGSharedMemoryCreate(Size size, /* * Initialize space allocation status for segment. */ - hdr->totalsize = size; + hdr->totalsize = mapping->shmem_req_size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); *shim = hdr; /* Save info for possible future use */ - UsedShmemSegAddr = memAddress; - UsedShmemSegID = (unsigned long) NextShmemSegID; + segment->seg_addr = memAddress; + segment->seg_id = (unsigned long) NextShmemSegID; /* * If AnonymousShmem is NULL here, then we're not using anonymous shared @@ -866,10 +1096,10 @@ PGSharedMemoryCreate(Size size, * block. Otherwise, the System V shared memory block is only a shim, and * we must return a pointer to the real block. 
*/ - if (AnonymousShmem == NULL) + if (segment->shmem == NULL) return hdr; - memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); - return (PGShmemHeader *) AnonymousShmem; + memcpy(segment->shmem, hdr, sizeof(PGShmemHeader)); + return (PGShmemHeader *) segment->shmem; } #ifdef EXEC_BACKEND @@ -969,23 +1199,28 @@ PGSharedMemoryNoReAttach(void) void PGSharedMemoryDetach(void) { - if (UsedShmemSegAddr != NULL) + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) { - if ((shmdt(UsedShmemSegAddr) < 0) + ShmemSegment *segment = &Segments[i]; + + if (segment->seg_addr != NULL) + { + if ((shmdt(segment->seg_addr) < 0) #if defined(EXEC_BACKEND) && defined(__CYGWIN__) - /* Work-around for cygipc exec bug */ - && shmdt(NULL) < 0 + /* Work-around for cygipc exec bug */ + && shmdt(NULL) < 0 #endif - ) - elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); - UsedShmemSegAddr = NULL; - } + ) + elog(LOG, "shmdt(%p) failed: %m", segment->seg_addr); + segment->seg_addr = NULL; + } - if (AnonymousShmem != NULL) - { - if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) - elog(LOG, "munmap(%p, %zu) failed: %m", - AnonymousShmem, AnonymousShmemSize); - AnonymousShmem = NULL; + if (segment->shmem != NULL) + { + if (munmap(segment->shmem, segment->shmem_size) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + segment->shmem, segment->shmem_size); + segment->shmem = NULL; + } } } diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c index 5854ad1f54d3..e7365ff8060d 100644 --- a/src/backend/port/win32_sema.c +++ b/src/backend/port/win32_sema.c @@ -44,7 +44,7 @@ PGSemaphoreShmemSize(int maxSemas) * process exits. 
 */
 void
-PGReserveSemaphores(int maxSemas)
+PGReserveSemaphores(int maxSemas, int shmem_segment)
 {
 	mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE));
 	if (mySemSet == NULL)
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 4dee856d6bd6..5c0c32babaf1 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -204,7 +204,7 @@ EnableLockPagesPrivilege(int elevel)
  * standard header.
  */
 PGShmemHeader *
-PGSharedMemoryCreate(Size size,
+PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id,
 					 PGShmemHeader **shim)
 {
 	void	   *memAddress;
@@ -216,9 +216,10 @@ PGSharedMemoryCreate(Size size,
 	DWORD		size_high;
 	DWORD		size_low;
 	SIZE_T		largePageSize = 0;
-	Size		orig_size = size;
+	Size		size = mapping_sizes->shmem_req_size;
 	DWORD		flProtect = PAGE_READWRITE;
 	DWORD		desiredAccess;
+	ShmemSegment *segment = &Segments[segment_id];
 
 	ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE,
 										 MEM_RESERVE, PAGE_NOACCESS);
@@ -304,7 +305,7 @@ PGSharedMemoryCreate(Size size,
 				 * Use the original size, not the rounded-up value, when
 				 * falling back to non-huge pages.
 				 */
-				size = orig_size;
+				size = mapping_sizes->shmem_req_size;
 				flProtect = PAGE_READWRITE;
 				goto retry;
 			}
@@ -393,6 +394,11 @@ PGSharedMemoryCreate(Size size,
 	hdr->dsm_control = 0;
 
 	/* Save info for possible future use */
+	segment->shmem_size = size;
+	segment->seg_addr = memAddress;
+	segment->shmem = (Pointer) hdr;
+	segment->seg_id = (unsigned long) hmap2;
+
 	UsedShmemSegAddr = memAddress;
 	UsedShmemSegSize = size;
 	UsedShmemSegID = hmap2;
@@ -627,7 +633,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild)
  * use GetLargePageMinimum() instead.
*/ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { if (hugepagesize) *hugepagesize = 0; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6fd3a6bbac5e..4fa547f48dea 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -17,6 +17,7 @@ #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -62,7 +63,10 @@ CkptSortItem *CkptBufferIds; * Initialize shared buffer pool * * This is called once during shared-memory initialization (either in the - * postmaster, or in a standalone backend). + * postmaster, or in a standalone backend). Size of data structures initialized + * here depends on NBuffers, and to be able to change NBuffers without a + * restart we store each structure into a separate shared memory segment, which + * could be resized on demand. */ void BufferManagerShmemInit(void) @@ -74,22 +78,22 @@ BufferManagerShmemInit(void) /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) - ShmemInitStruct("Buffer Descriptors", + ShmemInitStructInSegment("Buffer Descriptors", NBuffers * sizeof(BufferDescPadded), - &foundDescs); + &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, - ShmemInitStruct("Buffer Blocks", + ShmemInitStructInSegment("Buffer Blocks", NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, - &foundBufs)); + &foundBufs, BUFFERS_SHMEM_SEGMENT)); /* Align condition variables to cacheline boundary. 
*/ BufferIOCVArray = (ConditionVariableMinimallyPadded *) - ShmemInitStruct("Buffer IO Condition Variables", + ShmemInitStructInSegment("Buffer IO Condition Variables", NBuffers * sizeof(ConditionVariableMinimallyPadded), - &foundIOCV); + &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); /* * The array used to sort to-be-checkpointed buffer ids is located in @@ -99,8 +103,9 @@ BufferManagerShmemInit(void) * painful. */ CkptBufferIds = (CkptSortItem *) - ShmemInitStruct("Checkpoint BufferIds", - NBuffers * sizeof(CkptSortItem), &foundBufCkpt); + ShmemInitStructInSegment("Checkpoint BufferIds", + NBuffers * sizeof(CkptSortItem), &foundBufCkpt, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) { @@ -147,33 +152,42 @@ BufferManagerShmemInit(void) * BufferManagerShmemSize * * compute the size of shared memory for the buffer pool including - * data pages, buffer descriptors, hash tables, etc. + * data pages, buffer descriptors, hash tables, etc. based on the + * shared memory segment. The main segment must not allocate anything + * related to buffers, every other segment will receive part of the + * data. 
 */
 Size
-BufferManagerShmemSize(void)
+BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes)
 {
-	Size		size = 0;
+	Size		size;
 
-	/* size of buffer descriptors */
-	size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
-	/* to allow aligning buffer descriptors */
+	/* size of buffer descriptors, plus alignment padding */
+	size = add_size(0, mul_size(NBuffers, sizeof(BufferDescPadded)));
 	size = add_size(size, PG_CACHE_LINE_SIZE);
+	mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size;
+	mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_reserved = size;
 
 	/* size of data pages, plus alignment padding */
-	size = add_size(size, PG_IO_ALIGN_SIZE);
+	size = add_size(0, PG_IO_ALIGN_SIZE);
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
+	mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size;
+	mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_reserved = size;
 
 	/* size of stuff controlled by freelist.c */
-	size = add_size(size, StrategyShmemSize());
+	mapping_sizes[STRATEGY_SHMEM_SEGMENT].shmem_req_size = StrategyShmemSize();
+	mapping_sizes[STRATEGY_SHMEM_SEGMENT].shmem_reserved = StrategyShmemSize();
 
-	/* size of I/O condition variables */
-	size = add_size(size, mul_size(NBuffers,
+	/* size of I/O condition variables, plus alignment padding */
+	size = add_size(0, mul_size(NBuffers,
 								   sizeof(ConditionVariableMinimallyPadded)));
-	/* to allow aligning the above */
 	size = add_size(size, PG_CACHE_LINE_SIZE);
+	mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size;
+	mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_reserved = size;
 
 	/* size of checkpoint sort array in bufmgr.c */
-	size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+	mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffers, sizeof(CkptSortItem)); /* XXX: 'size' still holds the IOCV-segment total here, so the unchanged "return size" below makes the caller count it again in the main segment -- should this function return 0? */
+	mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(NBuffers, sizeof(CkptSortItem));
 
 	return size;
 }
diff --git a/src/backend/storage/buffer/buf_table.c
b/src/backend/storage/buffer/buf_table.c index f0c39ec28222..67e87f9935de 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -25,6 +25,7 @@ #include "funcapi.h" #include "storage/buf_internals.h" #include "storage/lwlock.h" +#include "storage/pg_shmem.h" #include "utils/rel.h" #include "utils/builtins.h" @@ -64,10 +65,11 @@ InitBufTable(int size) info.entrysize = sizeof(BufferLookupEnt); info.num_partitions = NUM_BUFFER_PARTITIONS; - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", + SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table", size, size, &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE); + HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE, + STRATEGY_SHMEM_SEGMENT); } /* diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 28d952b35344..13ee840ab9f5 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -19,6 +19,7 @@ #include "port/atomics.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) @@ -418,9 +419,9 @@ StrategyInitialize(bool init) * Get or create the shared strategy control block */ StrategyControl = (BufferStrategyControl *) - ShmemInitStruct("Buffer Strategy Status", + ShmemInitStructInSegment("Buffer Strategy Status", sizeof(BufferStrategyControl), - &found); + &found, STRATEGY_SHMEM_SEGMENT); if (!found) { diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 2704e80b3a7d..1965b2d3eb4d 100644 --- a/src/backend/storage/ipc/ipc.c +++ b/src/backend/storage/ipc/ipc.c @@ -61,6 +61,8 @@ static void proc_exit_prepare(int code); * but provide some additional features we need --- in particular, * we want to register callbacks to invoke when we are disconnecting * from a broken shared-memory context but not 
exiting the postmaster. + * Maximum number of such exit callbacks depends on the number of shared + * segments. * * Callback functions can take zero, one, or two args: the first passed * arg is the integer exitcode, the second is the Datum supplied when @@ -68,7 +70,7 @@ static void proc_exit_prepare(int code); * ---------------------------------------------------------------- */ -#define MAX_ON_EXITS 20 +#define MAX_ON_EXITS 40 struct ONEXIT { diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index b23d0c19360a..41190f966395 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -81,10 +81,17 @@ RequestAddinShmemSpace(Size size) /* * CalculateShmemSize - * Calculates the amount of shared memory needed. + * Calculates the amount of shared memory needed. + * + * The amount of shared memory required per segment is saved in mapping_sizes, + * which is expected to be an array of size NUM_MEMORY_MAPPINGS. The total + * amount of memory needed across all the segments is returned. For the memory + * mappings which reserve address space for future expansion, the required + * amount of reserved space is saved in mapping_sizes of those segments. + * This memory is not included in the returned value. */ Size -CalculateShmemSize(void) +CalculateShmemSize(MemoryMappingSizes *mapping_sizes) { Size size; @@ -102,7 +109,13 @@ CalculateShmemSize(void) sizeof(ShmemIndexEnt))); size = add_size(size, dsm_estimate_size()); size = add_size(size, DSMRegistryShmemSize()); - size = add_size(size, BufferManagerShmemSize()); + + /* + * Buffer manager adds estimates for memory requirements for every shared + * memory segment that it uses in the corresponding AnonymousMappings. + * Consider size required from only the main shared memory segment here. 
+ */ + size = add_size(size, BufferManagerShmemSize(mapping_sizes)); size = add_size(size, LockManagerShmemSize()); size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); @@ -144,8 +157,22 @@ CalculateShmemSize(void) /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); + /* + * All the shared memory allocations considered so far happen in the main + * shared memory segment. + */ + mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_req_size = size; + mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_reserved = size; + + size = 0; /* might as well round it off to a multiple of a typical page size */ - size = add_size(size, 8192 - (size % 8192)); + for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++) + { + mapping_sizes[segment].shmem_req_size = add_size(mapping_sizes[segment].shmem_req_size, 8192 - (mapping_sizes[segment].shmem_req_size % 8192)); + mapping_sizes[segment].shmem_reserved = add_size(mapping_sizes[segment].shmem_reserved, 8192 - (mapping_sizes[segment].shmem_reserved % 8192)); + /* Compute the total size of all segments */ + size = size + mapping_sizes[segment].shmem_req_size; + } return size; } @@ -191,32 +218,44 @@ CreateSharedMemoryAndSemaphores(void) { PGShmemHeader *shim; PGShmemHeader *seghdr; - Size size; + MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS]; Assert(!IsUnderPostmaster); - /* Compute the size of the shared-memory block */ - size = CalculateShmemSize(); - elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + CalculateShmemSize(mapping_sizes); - /* - * Create the shmem segment - */ - seghdr = PGSharedMemoryCreate(size, &shim); - - /* - * Make sure that huge pages are never reported as "unknown" while the - * server is running. 
- */
-	Assert(strcmp("unknown",
-				  GetConfigOption("huge_pages_status", false, false)) != 0);
-
-	InitShmemAccess(seghdr);
+	/* Decide if we use huge pages or regular size pages */
+	PrepareHugePages();
 
-	/*
-	 * Set up shared memory allocation mechanism
-	 */
-	InitShmemAllocation();
+	for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+	{
+		MemoryMappingSizes *mapping = &mapping_sizes[segment];
+
+		/* Compute the size of the shared-memory block */
+		elog(DEBUG3, "invoking IpcMemoryCreate(segment %s, size=%zu, reserved address space=%zu)",
+			 MappingName(segment), mapping->shmem_req_size, mapping->shmem_reserved);
+
+		/*
+		 * Create the shmem segment.
+		 *
+		 * XXX: Are multiple shims needed, one per segment?
+		 */
+		seghdr = PGSharedMemoryCreate(mapping, segment, &shim);
+
+		/*
+		 * Make sure that huge pages are never reported as "unknown" while the
+		 * server is running.
+		 */
+		Assert(strcmp("unknown",
+					  GetConfigOption("huge_pages_status", false, false)) != 0);
+
+		InitShmemAccessInSegment(seghdr, segment);
+
+		/*
+		 * Set up shared memory allocation mechanism
+		 */
+		InitShmemAllocationInSegment(segment);
+	}
 
 	/* Initialize subsystems */
 	CreateOrAttachShmemStructs();
@@ -334,7 +373,9 @@ CreateOrAttachShmemStructs(void)
  * InitializeShmemGUCs
  *
  * This function initializes runtime-computed GUCs related to the amount of
- * shared memory required for the current configuration.
+ * shared memory required for the current configuration. The per-segment
+ * memory requirements are recomputed locally here via CalculateShmemSize
+ * into a mapping_sizes array.
  */
 void
 InitializeShmemGUCs(void)
 {
 	char		buf[64];
 	Size		size_b;
 	Size		size_mb;
 	Size		hp_size;
+	MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
+
 	/*
 	 * Calculate the shared memory size and round up to the nearest megabyte.
*/ - size_b = CalculateShmemSize(); + size_b = CalculateShmemSize(mapping_sizes); size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); sprintf(buf, "%zu", size_mb); SetConfigOption("shared_memory_size", buf, @@ -356,7 +399,7 @@ InitializeShmemGUCs(void) /* * Calculate the number of huge pages required. */ - GetHugePageSize(&hp_size, NULL); + GetHugePageSize(&hp_size, NULL, NULL); if (hp_size != 0) { Size hp_required; diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 0f18beb6ad4a..f303a9328dfc 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -76,20 +76,19 @@ #include "utils/builtins.h" static void *ShmemAllocRaw(Size size, Size *allocated_size); -static void *ShmemAllocUnlocked(Size size); +static void *ShmemAllocRawInSegment(Size size, Size *allocated_size, + int shmem_segment); /* shared memory global variables */ -static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ +ShmemSegment Segments[NUM_MEMORY_MAPPINGS]; -static void *ShmemBase; /* start address of shared memory */ - -static void *ShmemEnd; /* end+1 address of shared memory */ - -slock_t *ShmemLock; /* spinlock for shared memory and LWLock - * allocation */ - -static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* + * Primary index hashtable for shmem, for simplicity we use a single for all + * shared memory segments. There can be performance consequences of that, and + * an alternative option would be to have one index per shared memory segments. 
+ */ +static HTAB *ShmemIndex = NULL; /* To get reliable results for NUMA inquiry we need to "touch pages" once */ static bool firstNumaTouch = true; @@ -102,9 +101,17 @@ Datum pg_numa_available(PG_FUNCTION_ARGS); void InitShmemAccess(PGShmemHeader *seghdr) { - ShmemSegHdr = seghdr; - ShmemBase = seghdr; - ShmemEnd = (char *) ShmemBase + seghdr->totalsize; + InitShmemAccessInSegment(seghdr, MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAccessInSegment(PGShmemHeader *seghdr, int shmem_segment) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + ShmemSegment *seg = &Segments[shmem_segment]; + seg->ShmemSegHdr = shmhdr; + seg->ShmemBase = (void *) shmhdr; + seg->ShmemEnd = (char *) seg->ShmemBase + shmhdr->totalsize; } /* @@ -115,7 +122,13 @@ InitShmemAccess(PGShmemHeader *seghdr) void InitShmemAllocation(void) { - PGShmemHeader *shmhdr = ShmemSegHdr; + InitShmemAllocationInSegment(MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAllocationInSegment(int shmem_segment) +{ + PGShmemHeader *shmhdr = Segments[shmem_segment].ShmemSegHdr; char *aligned; Assert(shmhdr != NULL); @@ -124,9 +137,9 @@ InitShmemAllocation(void) * Initialize the spinlock used by ShmemAlloc. We must use * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. 
*/ - ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + Segments[shmem_segment].ShmemLock = (slock_t *) ShmemAllocUnlockedInSegment(sizeof(slock_t), shmem_segment); - SpinLockInit(ShmemLock); + SpinLockInit(Segments[shmem_segment].ShmemLock); /* * Allocations after this point should go through ShmemAlloc, which @@ -151,16 +164,22 @@ InitShmemAllocation(void) */ void * ShmemAlloc(Size size) +{ + return ShmemAllocInSegment(size, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemAllocInSegment(Size size, int shmem_segment) { void *newSpace; Size allocated_size; - newSpace = ShmemAllocRaw(size, &allocated_size); + newSpace = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (!newSpace) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); return newSpace; } @@ -185,6 +204,12 @@ ShmemAllocNoError(Size size) */ static void * ShmemAllocRaw(Size size, Size *allocated_size) +{ + return ShmemAllocRawInSegment(size, allocated_size, MAIN_SHMEM_SEGMENT); +} + +static void * +ShmemAllocRawInSegment(Size size, Size *allocated_size, int shmem_segment) { Size newStart; Size newFree; @@ -204,22 +229,22 @@ ShmemAllocRaw(Size size, Size *allocated_size) size = CACHELINEALIGN(size); *allocated_size = size; - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[shmem_segment].ShmemLock); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree <= ShmemSegHdr->totalsize) + if (newFree <= Segments[shmem_segment].ShmemSegHdr->totalsize) { - newSpace = (char *) ShmemBase + newStart; - ShmemSegHdr->freeoffset = newFree; + newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; + Segments[shmem_segment].ShmemSegHdr->freeoffset = 
newFree; } else newSpace = NULL; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[shmem_segment].ShmemLock); /* note this assert is okay with newSpace == NULL */ Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); @@ -228,15 +253,16 @@ ShmemAllocRaw(Size size, Size *allocated_size) } /* - * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory + * ShmemAllocUnlockedInSegment + * allocate max-aligned chunk from given shared memory segment * * Allocate space without locking ShmemLock. This should be used for, * and only for, allocations that must happen before ShmemLock is ready. * * We consider maxalign, rather than cachealign, sufficient here. */ -static void * -ShmemAllocUnlocked(Size size) +void * +ShmemAllocUnlockedInSegment(Size size, int shmem_segment) { Size newStart; Size newFree; @@ -247,19 +273,19 @@ ShmemAllocUnlocked(Size size) */ size = MAXALIGN(size); - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree > ShmemSegHdr->totalsize) + if (newFree > Segments[shmem_segment].ShmemSegHdr->totalsize) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); - ShmemSegHdr->freeoffset = newFree; + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); + Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; - newSpace = (char *) ShmemBase + newStart; + newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; Assert(newSpace == (void *) MAXALIGN(newSpace)); @@ -274,7 +300,13 @@ ShmemAllocUnlocked(Size size) bool ShmemAddrIsValid(const void *addr) { - return (addr >= ShmemBase) && (addr < ShmemEnd); + return ShmemAddrIsValidInSegment(addr, MAIN_SHMEM_SEGMENT); +} + +bool +ShmemAddrIsValidInSegment(const void *addr, int shmem_segment) +{ + return (addr 
>= Segments[shmem_segment].ShmemBase) && (addr < Segments[shmem_segment].ShmemEnd);
 }
 
 /*
@@ -335,6 +367,18 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 			  int64 max_size,	/* max size of the table */
 			  HASHCTL *infoP,	/* info about key and bucket size */
 			  int hash_flags)	/* info about infoP */
+{
+	return ShmemInitHashInSegment(name, init_size, max_size, infoP, hash_flags,
+								  MAIN_SHMEM_SEGMENT);
+}
+
+HTAB *
+ShmemInitHashInSegment(const char *name, /* table string name for shmem index */
+			  int64 init_size,	/* initial table size */
+			  int64 max_size,	/* max size of the table */
+			  HASHCTL *infoP,	/* info about key and bucket size */
+			  int hash_flags,	/* info about infoP */
+			  int shmem_segment) /* in which segment to keep the table */
 {
 	bool		found;
 	void	   *location;
@@ -351,9 +395,9 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
 
 	/* look it up in the shmem index */
-	location = ShmemInitStruct(name,
+	location = ShmemInitStructInSegment(name,
 							   hash_get_shared_size(infoP, hash_flags),
-							   &found);
+							   &found, shmem_segment);
 
 	/*
 	 * if it already exists, attach to it rather than allocate and initialize
@@ -386,6 +430,13 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
  */
 void *
 ShmemInitStruct(const char *name, Size size, bool *foundPtr)
+{
+	return ShmemInitStructInSegment(name, size, foundPtr, MAIN_SHMEM_SEGMENT);
+}
+
+void *
+ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
+				int shmem_segment)
 {
 	ShmemIndexEnt *result;
 	void	   *structPtr;
@@ -394,7 +445,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 
 	if (!ShmemIndex)
 	{
-		PGShmemHeader *shmemseghdr = ShmemSegHdr;
+		PGShmemHeader *shmemseghdr = Segments[shmem_segment].ShmemSegHdr;
 
 		/* Must be trying to create/attach to ShmemIndex itself */
 		Assert(strcmp(name, "ShmemIndex") == 0);
@@ -417,7 +468,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 	 * 
process can be accessing shared memory yet. */ Assert(shmemseghdr->index == NULL); - structPtr = ShmemAlloc(size); + structPtr = ShmemAllocInSegment(size, shmem_segment); shmemseghdr->index = structPtr; *foundPtr = false; } @@ -434,8 +485,8 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) LWLockRelease(ShmemIndexLock); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("could not create ShmemIndex entry for data structure \"%s\"", - name))); + errmsg("could not create ShmemIndex entry for data structure \"%s\" in segment %d", + name, shmem_segment))); } if (*foundPtr) @@ -460,7 +511,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) Size allocated_size; /* It isn't in the table yet. allocate and initialize it */ - structPtr = ShmemAllocRaw(size, &allocated_size); + structPtr = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (structPtr == NULL) { /* out of memory; remove the failed ShmemIndex entry */ @@ -475,18 +526,18 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) result->size = size; result->allocated_size = allocated_size; result->location = structPtr; + result->shmem_segment = shmem_segment; } LWLockRelease(ShmemIndexLock); - Assert(ShmemAddrIsValid(structPtr)); + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); return structPtr; } - /* * Add two Size values, checking for overflow */ @@ -527,13 +578,14 @@ mul_size(Size s1, Size s2) Datum pg_get_shmem_allocations(PG_FUNCTION_ARGS) { -#define PG_GET_SHMEM_SIZES_COLS 4 +#define PG_GET_SHMEM_SIZES_COLS 5 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; HASH_SEQ_STATUS hstat; ShmemIndexEnt *ent; - Size named_allocated = 0; + Size named_allocated[NUM_MEMORY_MAPPINGS] = {0}; Datum values[PG_GET_SHMEM_SIZES_COLS]; bool nulls[PG_GET_SHMEM_SIZES_COLS]; + int i; InitMaterializedSRF(fcinfo, 0); @@ -546,29 +598,40 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) while ((ent = 
(ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) { values[0] = CStringGetTextDatum(ent->key); - values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); - values[2] = Int64GetDatum(ent->size); - values[3] = Int64GetDatum(ent->allocated_size); - named_allocated += ent->allocated_size; + values[1] = CStringGetTextDatum(MappingName(ent->shmem_segment)); + values[2] = Int64GetDatum((char *) ent->location - (char *) Segments[ent->shmem_segment].ShmemSegHdr); + values[3] = Int64GetDatum(ent->size); + values[4] = Int64GetDatum(ent->allocated_size); + named_allocated[ent->shmem_segment] += ent->allocated_size; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } /* output shared memory allocated but not counted via the shmem index */ - values[0] = CStringGetTextDatum(""); - nulls[1] = true; - values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + values[0] = CStringGetTextDatum(""); + values[1] = CStringGetTextDatum(MappingName(i)); + nulls[2] = true; + values[3] = Int64GetDatum(Segments[i].ShmemSegHdr->freeoffset - named_allocated[i]); + values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } /* output as-of-yet unused shared memory */ - nulls[0] = true; - values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); - nulls[1] = false; - values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + memset(nulls, 0, sizeof(nulls)); + + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + PGShmemHeader *shmhdr = Segments[i].ShmemSegHdr; + nulls[0] = true; + values[1] = CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(shmhdr->freeoffset); + values[3] = Int64GetDatum(shmhdr->totalsize - shmhdr->freeoffset); + 
values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } LWLockRelease(ShmemIndexLock); @@ -593,7 +656,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) Size os_page_size; void **page_ptrs; int *pages_status; - uint64 shm_total_page_count, + uint64 shm_total_page_count = 0, shm_ent_page_count, max_nodes; Size *nodes; @@ -628,7 +691,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * this is not very likely, and moreover we have more entries, each of * them using only fraction of the total pages. */ - shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1; + for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++) + { + PGShmemHeader *shmhdr = Segments[segment].ShmemSegHdr; + shm_total_page_count += (shmhdr->totalsize / os_page_size) + 1; + } + page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); pages_status = palloc(sizeof(int) * shm_total_page_count); @@ -751,7 +819,7 @@ pg_get_shmem_pagesize(void) Assert(huge_pages_status != HUGE_PAGES_UNKNOWN); if (huge_pages_status == HUGE_PAGES_ON) - GetHugePageSize(&os_page_size, NULL); + GetHugePageSize(&os_page_size, NULL, NULL); return os_page_size; } @@ -761,3 +829,46 @@ pg_numa_available(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(pg_numa_init() != -1); } + +/* SQL SRF showing shared memory segments */ +Datum +pg_get_shmem_segments(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SEGS_COLS 6 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_GET_SHMEM_SEGS_COLS]; + bool nulls[PG_GET_SHMEM_SEGS_COLS]; + int i; + + InitMaterializedSRF(fcinfo, 0); + + /* output all allocated entries */ + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + ShmemSegment *segment = &Segments[i]; + PGShmemHeader *shmhdr = segment->ShmemSegHdr; + int j; + + if (shmhdr == NULL) + { + for (j = 0; j < PG_GET_SHMEM_SEGS_COLS; j++) + nulls[j] = true; + } + else + { + memset(nulls, 0, sizeof(nulls)); + values[0] = Int32GetDatum(i); + values[1] = 
CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(shmhdr->totalsize); + values[3] = Int64GetDatum(shmhdr->freeoffset); + values[4] = Int64GetDatum(segment->shmem_size); + values[5] = Int64GetDatum(segment->shmem_reserved); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} + diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index b017880f5e45..c25dd13b63af 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -80,6 +80,8 @@ #include "pg_trace.h" #include "pgstat.h" #include "port/pg_bitutils.h" +#include "postmaster/postmaster.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/proclist.h" #include "storage/procnumber.h" @@ -612,12 +614,15 @@ LWLockNewTrancheId(const char *name) /* * We use the ShmemLock spinlock to protect LWLockCounter and * LWLockTrancheNames. + * + * XXX: Looks like this is the only use of Segments outside of shmem.c, + * it's maybe worth it to reshape this part to hide Segments structure. 
*/ - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES) { - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); ereport(ERROR, (errmsg("maximum number of tranches already registered"), errdetail("No more than %d tranches may be registered.", @@ -628,7 +633,7 @@ LWLockNewTrancheId(const char *name) LocalLWLockCounter = *LWLockCounter; strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN); - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); return result; } @@ -750,9 +755,9 @@ GetLWTrancheName(uint16 trancheId) */ if (trancheId >= LocalLWLockCounter) { - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); LocalLWLockCounter = *LWLockCounter; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (trancheId >= LocalLWLockCounter) elog(ERROR, "tranche %d is not registered", trancheId); diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1edb18958f75..56c215b7be9c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8576,8 +8576,8 @@ { oid => '5052', descr => 'allocations from the main shared memory segment', proname => 'pg_get_shmem_allocations', prorows => '50', proretset => 't', provolatile => 'v', prorettype => 'record', proargtypes => '', - proallargtypes => '{text,int8,int8,int8}', proargmodes => '{o,o,o,o}', - proargnames => '{name,off,size,allocated_size}', + proallargtypes => '{text,text,int8,int8,int8}', proargmodes => '{o,o,o,o,o}', + proargnames => '{name,segment,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, { oid => '4099', descr => 'Is NUMA support available?', @@ -8600,6 +8600,14 @@ proargmodes => '{o,o,o}', proargnames => '{name,type,size}', prosrc => 'pg_get_dsm_registry_allocations' }, 
+# shared memory segments +{ oid => '5101', descr => 'shared memory segments', + proname => 'pg_get_shmem_segments', prorows => '6', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,text,int8,int8,int8,int8}', proargmodes => '{o,o,o,o,o,o}', + proargnames => '{id,name,size,freeoffset,mapping_size,mapping_reserved_size}', + prosrc => 'pg_get_shmem_segments' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h index ef9800732d90..40588ff69683 100644 --- a/src/include/portability/mem.h +++ b/src/include/portability/mem.h @@ -38,7 +38,7 @@ #define MAP_NOSYNC 0 #endif -#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) +#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE) /* Some really old systems don't define MAP_FAILED. */ #ifndef MAP_FAILED diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b5f8f3c5d42f..3769f4db7dc6 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -19,6 +19,7 @@ #include "storage/block.h" #include "storage/buf.h" #include "storage/bufpage.h" +#include "storage/pg_shmem.h" #include "storage/relfilelocator.h" #include "utils/relcache.h" #include "utils/snapmgr.h" @@ -326,7 +327,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, /* in buf_init.c */ extern void BufferManagerShmemInit(void); -extern Size BufferManagerShmemSize(void); +extern Size BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index 2a8a8f0eabdb..d73f1b407db8 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -18,6 +18,8 @@ #ifndef IPC_H #define IPC_H +#include "storage/pg_shmem.h" + typedef void (*pg_on_exit_callback) (int code, Datum arg); 
typedef void (*shmem_startup_hook_type) (void); @@ -77,7 +79,7 @@ extern void check_on_shmem_exit_lists_are_empty(void); /* ipci.c */ extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook; -extern Size CalculateShmemSize(void); +extern Size CalculateShmemSize(MemoryMappingSizes *mapping_sizes); extern void CreateSharedMemoryAndSemaphores(void); #ifdef EXEC_BACKEND extern void AttachSharedMemoryStructs(void); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 5f7d4b83a60e..beee0a53d2da 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -25,6 +25,13 @@ #define PG_SHMEM_H #include "storage/dsm_impl.h" +#include "storage/spin.h" + +typedef struct MemoryMappingSizes +{ + Size shmem_req_size; /* Required size of the segment */ + Size shmem_reserved; /* Required size of the reserved address space. */ +} MemoryMappingSizes; typedef struct PGShmemHeader /* standard header for all Postgres shmem */ { @@ -41,6 +48,27 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ #endif } PGShmemHeader; +typedef struct ShmemSegment +{ + PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ + void *ShmemBase; /* start address of shared memory */ + void *ShmemEnd; /* end+1 address of shared memory */ + slock_t *ShmemLock; /* spinlock for shared memory and LWLock + * allocation */ + int segment_fd; /* fd for the backing anon file */ + unsigned long seg_id; /* IPC key */ + int shmem_segment; /* TODO: Do we really need it? 
*/ + Size shmem_size; /* Size of the actually used memory */ + Size shmem_reserved; /* Size of the reserved mapping */ + Pointer shmem; /* Pointer to the start of the mapped memory */ + Pointer seg_addr; /* SysV shared memory for the header */ +} ShmemSegment; + +/* Number of available segments for anonymous memory mappings */ +#define NUM_MEMORY_MAPPINGS 6 + +extern PGDLLIMPORT ShmemSegment Segments[NUM_MEMORY_MAPPINGS]; + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; @@ -85,10 +113,38 @@ extern void PGSharedMemoryReAttach(void); extern void PGSharedMemoryNoReAttach(void); #endif -extern PGShmemHeader *PGSharedMemoryCreate(Size size, +extern PGShmemHeader *PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id, PGShmemHeader **shim); extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); extern void PGSharedMemoryDetach(void); -extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags); +extern const char *MappingName(int shmem_segment); +extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, + int *memfd_flags); +void PrepareHugePages(void); + +/* + * To be able to dynamically resize largest parts of the data stored in shared + * memory, we split it into multiple shared memory mappings segments. Each + * segment contains only certain part of the data, which size depends on + * NBuffers. + */ + +/* The main segment, contains everything except buffer blocks and related data. 
*/ +#define MAIN_SHMEM_SEGMENT 0 + +/* Buffer blocks */ +#define BUFFERS_SHMEM_SEGMENT 1 + +/* Buffer descriptors */ +#define BUFFER_DESCRIPTORS_SHMEM_SEGMENT 2 + +/* Condition variables for buffers */ +#define BUFFER_IOCV_SHMEM_SEGMENT 3 + +/* Checkpoint BufferIds */ +#define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4 + +/* Buffer strategy status */ +#define STRATEGY_SHMEM_SEGMENT 5 #endif /* PG_SHMEM_H */ diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 70a5b8b172c6..c56712555f09 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -30,14 +30,25 @@ extern PGDLLIMPORT slock_t *ShmemLock; typedef struct PGShmemHeader PGShmemHeader; /* avoid including * storage/pg_shmem.h here */ extern void InitShmemAccess(PGShmemHeader *seghdr); +extern void InitShmemAccessInSegment(struct PGShmemHeader *seghdr, + int shmem_segment); extern void InitShmemAllocation(void); +extern void InitShmemAllocationInSegment(int shmem_segment); extern void *ShmemAlloc(Size size); +extern void *ShmemAllocInSegment(Size size, int shmem_segment); extern void *ShmemAllocNoError(Size size); +extern void *ShmemAllocUnlockedInSegment(Size size, int shmem_segment); extern bool ShmemAddrIsValid(const void *addr); +extern bool ShmemAddrIsValidInSegment(const void *addr, int shmem_segment); extern void InitShmemIndex(void); extern HTAB *ShmemInitHash(const char *name, int64 init_size, int64 max_size, HASHCTL *infoP, int hash_flags); +extern HTAB *ShmemInitHashInSegment(const char *name, long init_size, + long max_size, HASHCTL *infoP, + int hash_flags, int shmem_segment); extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); +extern void *ShmemInitStructInSegment(const char *name, Size size, + bool *foundPtr, int shmem_segment); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); @@ -59,6 +70,7 @@ typedef struct void *location; /* location in shared mem */ Size size; /* # bytes requested for the structure */ 
Size allocated_size; /* # bytes actually allocated */ + int shmem_segment; /* segment in which the structure is allocated */ } ShmemIndexEnt; #endif /* SHMEM_H */ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 372a2188c22a..f02e82de520d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1765,14 +1765,22 @@ pg_shadow| SELECT pg_authid.rolname AS usename, LEFT JOIN pg_db_role_setting s ON (((pg_authid.oid = s.setrole) AND (s.setdatabase = (0)::oid)))) WHERE pg_authid.rolcanlogin; pg_shmem_allocations| SELECT name, + segment, off, size, allocated_size - FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); + FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, segment, off, size, allocated_size); pg_shmem_allocations_numa| SELECT name, numa_node, size FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size); +pg_shmem_segments| SELECT id, + name, + size, + freeoffset, + mapping_size, + mapping_reserved_size + FROM pg_get_shmem_segments() pg_get_shmem_segments(id, name, size, freeoffset, mapping_size, mapping_reserved_size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, From b7678e1ac14782c8f3ba99dfdddcac92322f19b4 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Tue, 17 Jun 2025 14:16:55 +0200 Subject: [PATCH 3/4] Allow to resize shared memory without restart shared_buffers is now PGC_SIGHUP instead of PGC_POSTMASTER. The value of this GUC is saved in NBuffersPending instead of NBuffers. When the server starts, the shared memory size is estimated and the memory is allocated using NBuffersPending. When a server is running, the new value of GUC (set using ALTER SYSTEM ... SET shared_buffers = ...; followed by SELECT pg_reload_conf()) does not come into effect immediately. Instead a function pg_resize_shared_buffers() is used to resize the buffer pool. 
The function uses the current value of GUC in the backends where it is executed. The function also coordinates the buffer resizing synchronization across backends. SHOW shared_buffers now shows the current size of the shared buffer pool but it also shows the pending size of shared buffers, if any. A new GUC max_shared_buffers is introduced to control the maximum value of shared_buffers that can be set. By default it is 0. When explicitly set it needs to be higher than 'shared_buffers'. When max_shared_buffers is set to 0, it assumes the same value as GUC shared_buffers. This GUC determines the size of address space reserved for future buffer pool sizes and the size of the buffer lookup table. TBD: Describe the protocol used by pg_resize_shared_buffers() to synchronize the buffer resizing operation with other backends. When shrinking the shared buffers pool, each buffer in the area being shrunk needs to be flushed if it's dirty so as not to lose the changes to that buffer after shrinking. Also, each such buffer needs to be removed from the buffer mapping table so that backends do not access it after shrinking. If a buffer being evicted is pinned, we abort the resizing operation. There are other alternatives which are not implemented in the current patches: 1. wait for the pinned buffer to get unpinned, 2. the backend is killed or it itself cancels the query, or 3. roll back the operation. Note that options 1 and 2 would require the pinning related local and shared records to be accessed. But we need infrastructure to do either of those right now. So far the buffer pool metadata (NBuffers and the shared memory segment address space) is saved in process local heap memory since it's static for the life of a server. It is passed to a new backend through Postmaster. But with the buffer pool being resized while the server is running, we need Postmaster to update its buffer pool metadata as the resizing progresses and pass it to the new backend. This has a few complications: 1. 
Postmaster does not receive ProcSignalBarrier. So we need to signal it separately. 2. Postmaster's local state is inherited by the new backend when fork()ed. But we need a more complex implementation to pass it to an exec()ed backend. 3. A new backend may receive the updated state from Postmaster and also the signal barrier which prompts the same update. Thus the proc signal barrier code needs to be idempotent; adding further complexity to it. 4. This task takes away Postmaster resources from its core functionality. This can be avoided by the following two changes: 1. The shared memory is resized only in a single backend without requiring any changes to the memory address space. 2. Maintaining the buffer pool metadata in the shared memory instead of process local memory. This change may affect performance so verify that performance is not degraded. TODO: In case the backend executing pg_resize_shared_buffers() exits before the operation finishes, we need to make sure that the changes made to the shared memory while resizing are cleaned up properly. Removing the evicted buffers from buffer ring ============================================= If the buffer pool has been shrunk, the buffers in the buffer ring may not be valid anymore. Modify GetBufferFromRing to check if the buffer is still valid before using it. This makes GetBufferFromRing() a bit more expensive because of an additional boolean condition and masks any bug that introduces an invalid buffer into the ring. The alternative fix is more complex as explained below. The strategy object is created in CurrentMemoryContext and is not available in any global structure, thus not accessible when processing buffer resizing barriers. We may modify GetAccessStrategy() to register the strategy in a global linked list and then arrange to deregister it once it's no longer in use. Looking at the places which use GetAccessStrategy(), fixing all those may be some work. 
Author: Ashutosh Bapat Author: Dmitrii Dolgov Author of some tests: Palak Chaturvedi Reviewed-by: Tomas Vondra More detailed notes follow: Need to see which of those fit in the commit message and which should be removed. Reinitializing strategy control area ===================================== The commit introduces a separate function StrategyReInitialize() instead of reusing StrategyInitialize() since some of the things that the second one does are not required in the first one. Here's a list of what StrategyReInitialize() does and how it differs from StrategyInitialize(). 1. StrategyControl pointer needn't be fetched again since it should not change. But added an Assert to make sure the pointer is valid. 2. &StrategyControl->buffer_strategy_lock need not be initialized again. 3. nextVictimBuffer, completePasses and numBufferAllocs are viewed in the context of NBuffers. Now that NBuffers itself has changed, those three do not make sense. Reset them as if the server has restarted again. Ability to delay resizing operation =================================== This commit introduces a flag delay_shmem_resize, which postgresql backends and workers can use to signal the coordinator to delay the resizing operation. Background writer sets this flag when it's scanning buffers. Background writer operation (needs a rethink) =========================== Background writer is blocked when the actual resizing is in progress. It stops a scan in progress when it sees that the resizing has begun or is about to begin. Once the buffer resizing is finished, before resuming the regular operation, bgwriter resets the information saved so far. This information is viewed in the context of NBuffers and hence does not make sense after resizing which changes NBuffers. Buffer lookup table =================== Right now there is no way to free shared memory. Even if we shrink the buffer lookup table when shrinking the buffer pool the unused hash table entries cannot be freed. 
When we expand the buffer pool, more entries can be allocated but we cannot resize the hash table directory without rehashing all the entries. Just allocating more entries will lead to more contention. Hence we set up the buffer lookup table only once at the beginning, considering the maximum possible size of the buffer pool, which is MaxAvailableMemory. Shared buffer lookup table and StrategyControl are not resized even if the buffer pool is resized, hence they are allocated in the main shared memory segment. BgWriter refactoring ==================== The way BgBufferSync is written today, it packs four functionalities: setting up the buffer sync state, performing the buffer sync, resetting the buffer sync state when bgwriter_lru_maxpages <= 0 and setting it up again after bgwriter_lru_maxpages > 0. That makes the code hard to read. It will be good to divide this function into 3/4 different functions, each performing one functionality. Then pack all the state (the local variables from that function converted to static globals) into a structure, which is passed to these functions. Once that happens BgBufferSyncReset() will call one of the functions to reset the state when the buffer pool is resized. 
--- contrib/pg_buffercache/pg_buffercache_pages.c | 18 +- doc/src/sgml/config.sgml | 44 +- doc/src/sgml/func/func-admin.sgml | 57 +++ src/backend/access/transam/slru.c | 2 +- src/backend/access/transam/xlog.c | 2 +- src/backend/bootstrap/bootstrap.c | 2 + src/backend/port/sysv_shmem.c | 176 +++++++- src/backend/postmaster/checkpointer.c | 12 +- src/backend/postmaster/postmaster.c | 10 +- src/backend/storage/buffer/Makefile | 3 +- src/backend/storage/buffer/buf_init.c | 279 ++++++++++-- src/backend/storage/buffer/buf_resize.c | 399 ++++++++++++++++++ src/backend/storage/buffer/buf_table.c | 9 +- src/backend/storage/buffer/bufmgr.c | 160 ++++++- src/backend/storage/buffer/freelist.c | 106 ++++- src/backend/storage/buffer/meson.build | 1 + src/backend/storage/ipc/ipci.c | 13 +- src/backend/storage/ipc/procsignal.c | 55 +++ src/backend/storage/ipc/shmem.c | 66 ++- src/backend/tcop/postgres.c | 11 + .../utils/activity/wait_event_names.txt | 2 + src/backend/utils/init/globals.c | 5 +- src/backend/utils/init/postinit.c | 49 +++ src/backend/utils/misc/guc.c | 2 +- src/backend/utils/misc/guc_parameters.dat | 15 +- src/include/catalog/pg_proc.dat | 6 + src/include/miscadmin.h | 5 + src/include/storage/buf_internals.h | 1 + src/include/storage/bufmgr.h | 20 +- src/include/storage/ipc.h | 3 + src/include/storage/lwlocklist.h | 1 + src/include/storage/pg_shmem.h | 52 ++- src/include/storage/pmsignal.h | 3 +- src/include/storage/procsignal.h | 4 + src/include/storage/shmem.h | 3 + src/include/utils/guc.h | 2 + src/test/Makefile | 2 +- src/test/README | 3 + src/test/buffermgr/Makefile | 30 ++ src/test/buffermgr/README | 26 ++ src/test/buffermgr/buffermgr_test.conf | 11 + src/test/buffermgr/expected/buffer_resize.out | 329 +++++++++++++++ src/test/buffermgr/meson.build | 23 + src/test/buffermgr/sql/buffer_resize.sql | 95 +++++ src/test/buffermgr/t/001_resize_buffer.pl | 135 ++++++ .../buffermgr/t/003_parallel_resize_buffer.pl | 71 ++++ .../t/004_client_join_buffer_resize.pl | 241 
+++++++++++ src/test/meson.build | 1 + .../perl/PostgreSQL/Test/BackgroundPsql.pm | 76 ++++ src/tools/pgindent/typedefs.list | 1 + 50 files changed, 2535 insertions(+), 107 deletions(-) create mode 100644 src/backend/storage/buffer/buf_resize.c create mode 100644 src/test/buffermgr/Makefile create mode 100644 src/test/buffermgr/README create mode 100644 src/test/buffermgr/buffermgr_test.conf create mode 100644 src/test/buffermgr/expected/buffer_resize.out create mode 100644 src/test/buffermgr/meson.build create mode 100644 src/test/buffermgr/sql/buffer_resize.sql create mode 100644 src/test/buffermgr/t/001_resize_buffer.pl create mode 100644 src/test/buffermgr/t/003_parallel_resize_buffer.pl create mode 100644 src/test/buffermgr/t/004_client_join_buffer_resize.pl diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 31c4a339d74a..45efc6a314bf 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -118,6 +118,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); if (SRF_IS_FIRSTCALL()) { @@ -174,10 +175,10 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) /* Allocate NBuffers worth of BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); + sizeof(BufferCachePagesRec) * currentNBuffers); /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; + funcctx->max_calls = currentNBuffers; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -191,13 +192,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * snapshot across all buffers, but we do grab the buffer header * locks, so the information of each buffer is self-consistent. 
*/ - for (i = 0; i < NBuffers; i++) + for (i = 0; i < currentNBuffers; i++) { BufferDesc *bufHdr; uint32 buf_state; CHECK_FOR_INTERRUPTS(); + /* + * TODO: We should just scan the entire buffer descriptor + * array instead of relying on curent buffer pool size. But that can + * happen if only we setup the descriptor array large enough at the + * server startup time. + */ + if (currentNBuffers != pg_atomic_read_u32(&ShmemCtrl->currentNBuffers)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("number of shared buffers changed during scan of buffer cache"))); + bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 023b3f03ba93..d007055eed79 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1724,7 +1724,6 @@ include_dir 'conf.d' that is BLCKSZ bytes, typically 8kB. (Non-default values of BLCKSZ change the minimum value.) - This parameter can only be set at server start. @@ -1747,6 +1746,49 @@ include_dir 'conf.d' appropriate, so as to leave adequate space for the operating system. + + The shared memory consumed by the buffer pool is allocated and + initialized according to the value of the GUC at the time of starting + the server. A desired new value of GUC can be loaded while the server is + running using SIGHUP. But the buffer pool will + not be resized immediately. Use + pg_resize_shared_buffers() to dynamically resize + the shared buffer pool (see for details). + SHOW shared_buffers shows the current number of + shared buffers and pending number, if any. Please note that when the GUC + is changed, the other GUCS which use this GUCs value to set their + defaults will not be changed. They may still require a server restart to + consider new value. 
+ + + + + + max_shared_buffers (integer) + + max_shared_buffers configuration parameter + + + + + Sets the upper limit for the shared_buffers value. + The default value is 0, + which means no explicit limit is set and max_shared_buffers + will be automatically set to the value of shared_buffers + at server startup. + If this value is specified without units, it is taken as blocks, + that is BLCKSZ bytes, typically 8kB. + This parameter can only be set at server start. + + + + This parameter determines the amount of memory address space to reserve + in each backend for expanding the buffer pool in future. While the + memory for buffer pool is allocated on demand as it is resized, the + memory required to hold the buffer manager metadata is allocated + statically at the server start accounting for the largest buffer pool + size allowed by this parameter. + diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 1b465bc8ba71..0dc89b07c765 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -99,6 +99,63 @@ off + + + + + pg_resize_shared_buffers + + pg_resize_shared_buffers () + boolean + + + Dynamically resizes the shared buffer pool to match the current + value of the shared_buffers parameter. This + function implements a coordinated resize process that ensures all + backend processes acknowledge the change before completing the + operation. The resize happens in multiple phases to maintain + data consistency and system stability. Returns true + if the resize was successful, or raises an error if the operation + fails. This function can only be called by superusers. + + + To resize shared buffers, first update the shared_buffers + setting and reload the configuration, then verify the new value is loaded + before calling this function. 
For example: + +postgres=# ALTER SYSTEM SET shared_buffers = '256MB'; +ALTER SYSTEM +postgres=# SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +------------------------- + 128MB (pending: 256MB) +(1 row) + +postgres=# SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +---------------- + 256MB +(1 row) + + The SHOW shared_buffers step is important to verify + that the configuration reload was successful and the new value is + available to the current session before attempting the resize. The + output shows both the current and pending values when a change is waiting + to be applied. + + diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 77676d6d0359..73df59098866 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -232,7 +232,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) { return Min(max - (max % SLRU_BANK_SIZE), Max(SLRU_BANK_SIZE, - NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE)); + NBuffersPending / divisor - (NBuffersPending / divisor) % SLRU_BANK_SIZE)); } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 22d0a2e8c3a6..f4363e0035d9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4676,7 +4676,7 @@ XLOGChooseNumBuffers(void) { int xbuffers; - xbuffers = NBuffers / 32; + xbuffers = NBuffersPending / 32; if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) xbuffers = (wal_segment_size / XLOG_BLCKSZ); if (xbuffers < 8) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index fc8638c1b61b..226944e45882 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -335,6 +335,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) 
InitializeFastPathLocks(); + InitializeMaxNBuffers(); + CreateSharedMemoryAndSemaphores(); /* diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index cc4b2c80e1ab..68de301441bb 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -30,13 +30,19 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" #include "portability/mem.h" +#include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" #include "utils/guc.h" #include "utils/guc_hooks.h" #include "utils/pidfile.h" +#include "utils/wait_event.h" /* @@ -98,6 +104,8 @@ typedef enum unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; +volatile bool delay_shmem_resize = false; + /* * Anonymous mapping layout we use looks like this: * @@ -124,6 +132,9 @@ void *UsedShmemSegAddr = NULL; * being counted against memory limits). The mapping serves as an address space * reservation, into which shared memory segment can be extended and is * represented by the second /memfd:main with no permissions. + * + * The reserved space for buffer manager related segments is calculated based on + * MaxNBuffers. */ /* @@ -134,6 +145,42 @@ void *UsedShmemSegAddr = NULL; */ static bool huge_pages_on = false; +/* + * Currently broadcasted value of NBuffers in shared memory. + * + * Most of the time this value is going to be equal to NBuffers. But if + * postmaster is resizing shared memory and a new backend was created + * at the same time, there is a possibility for the new backend to inherit the + * old NBuffers value, but miss the resize signal if ProcSignal infrastructure + * was not initialized yet. 
Consider this situation: + * + * Postmaster ------> New Backend + * | | + * | Launch + * | | + * | Inherit NBuffers + * | | + * Resize NBuffers | + * | | + * Emit Barrier | + * | Init ProcSignal + * | | + * Finish resize | + * | | + * New NBuffers Old NBuffers + * + * In this case the backend is not yet ready to receive a signal from + * EmitProcSignalBarrier, and will be ignored. The same happens if ProcSignal + * is initialized even later, after the resizing was finished. + * + * To address resulting inconsistency, postmaster broadcasts the current + * NBuffers value via shared memory. Every new backend has to verify this value + * before it will access the buffer pool: if it differs from its own value, + * this indicates a shared memory resize has happened and the backend has to + * first synchronize with rest of the pack. + */ +ShmemControl *ShmemCtrl = NULL; + static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); static void IpcMemoryDelete(int status, Datum shmId); @@ -156,8 +203,6 @@ MappingName(int shmem_segment) return "iocv"; case CHECKPOINT_BUFFERS_SHMEM_SEGMENT: return "checkpoint"; - case STRATEGY_SHMEM_SEGMENT: - return "strategy"; default: return "unknown"; } @@ -921,6 +966,114 @@ AnonymousShmemDetach(int status, Datum arg) } } +/* + * Resize all shared memory segments based on the new shared_buffers value (saved + * in ShmemCtrl area). The actual segment resizing is done via ftruncate, which + * will fail if there is not sufficient space to expand the anon file. + * + * TODO: Rename this to BufferShmemResize() or something. Only buffer manager's + * memory should be resized in this function. + * + * TODO: This function changes the amount of shared memory used. So it should + * also update the show only GUCs shared_memory_size and + * shared_memory_size_in_huge_pages in all backends. SetConfigOption() may be + * used for that. 
But it's not clear whether is_reload parameter is safe to use + * while resizing is going on; also at what stage it should be done. + */ +bool +AnonymousShmemResize(void) +{ + int mmap_flags = PG_MMAP_FLAGS; + Size hugepagesize; + MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS]; + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* TODO: This is a hack. NBuffersPending should never be written by anything + * other than GUC system. Find a way to pass new NBuffers value to + * BufferManagerShmemSize(). */ + NBuffersPending = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + elog(DEBUG1, "Resize shmem from %d to %d", NBuffers, NBuffersPending); + +#ifndef MAP_HUGETLB + /* PrepareHugePages should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); +#else + if (huge_pages_on) + { + Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY); + GetHugePageSize(&hugepagesize, &mmap_flags, NULL); + } +#endif + + /* Note that BufferManagerShmemSize() indirectly depends on NBuffersPending. */ + BufferManagerShmemSize(mapping_sizes); + + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + MemoryMappingSizes *mapping = &mapping_sizes[i]; + ShmemSegment *segment = &Segments[i]; + PGShmemHeader *shmem_hdr = segment->ShmemSegHdr; + + /* Main shared memory segment is always static. Ignore it. */ + if (i == MAIN_SHMEM_SEGMENT) + continue; + + round_off_mapping_sizes(mapping); + round_off_mapping_sizes_for_hugepages(mapping, hugepagesize); + + /* + * Size of the reserved address space should not change, since it depends + * upon MaxNBuffers, which can be changed only on restart. 
+ */ + Assert(segment->shmem_reserved == mapping->shmem_reserved); +#ifdef MAP_HUGETLB + if (huge_pages_on && (mapping->shmem_req_size % hugepagesize != 0)) + mapping->shmem_req_size += hugepagesize - (mapping->shmem_req_size % hugepagesize); +#endif + elog(DEBUG1, "segment[%s]: requested size %zu, current size %zu, reserved %zu", + MappingName(i), mapping->shmem_req_size, segment->shmem_size, + segment->shmem_reserved); + + if (segment->shmem == NULL) + continue; + + if (segment->shmem_size == mapping->shmem_req_size) + continue; + + /* + * We should have reserved enough address space for resizing. PANIC if + * that's not the case. + */ + if (segment->shmem_reserved < mapping->shmem_req_size) + ereport(PANIC, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("not enough shared memory is reserved"))); + + elog(DEBUG1, "segment[%s]: resize from %zu to %zu at address %p", + MappingName(i), segment->shmem_size, + mapping->shmem_req_size, segment->shmem); + + /* + * Resize the backing file to resize the allocated memory, and allocate + * more memory on supported platforms if required. 
+ */ + if(ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1) + ereport(ERROR, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not truncate anonymous file for \"%s\": %m", + MappingName(i)))); + if (mapping->shmem_req_size > segment->shmem_size) + shmem_fallocate(segment->segment_fd, MappingName(i), mapping->shmem_req_size, ERROR); + + segment->shmem_size = mapping->shmem_req_size; + shmem_hdr->totalsize = segment->shmem_size; + segment->ShmemEnd = segment->shmem + segment->shmem_size; + } + + return true; +} + /* * PGSharedMemoryCreate * @@ -1224,3 +1377,22 @@ PGSharedMemoryDetach(void) } } } + +void +ShmemControlInit(void) +{ + bool foundShmemCtrl; + + ShmemCtrl = (ShmemControl *) + ShmemInitStruct("Shmem Control", sizeof(ShmemControl), + &foundShmemCtrl); + + if (!foundShmemCtrl) + { + pg_atomic_init_u32(&ShmemCtrl->targetNBuffers, 0); + pg_atomic_init_u32(&ShmemCtrl->currentNBuffers, 0); + pg_atomic_init_flag(&ShmemCtrl->resize_in_progress); + + ShmemCtrl->coordinator = 0; + } +} diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e84e8663e966..ef3f84a55f57 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -654,9 +654,12 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len) static void ProcessCheckpointerInterrupts(void) { - if (ProcSignalBarrierPending) - ProcessProcSignalBarrier(); - + /* + * Reloading config can trigger further signals, complicating interrupts + * processing -- so let it run first. + * + * XXX: Is there any need in memory barrier after ProcessConfigFile? 
+ */ if (ConfigReloadPending) { ConfigReloadPending = false; @@ -676,6 +679,9 @@ ProcessCheckpointerInterrupts(void) UpdateSharedMemoryConfig(); } + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7c064cf9fbb2..2095713d7c0e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -110,11 +110,15 @@ #include "replication/slotsync.h" #include "replication/walsender.h" #include "storage/aio_subsys.h" +#include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/io_worker.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" #include "tcop/backend_startup.h" #include "tcop/tcopprot.h" #include "utils/datetime.h" @@ -125,7 +129,6 @@ #ifdef EXEC_BACKEND #include "common/file_utils.h" -#include "storage/pg_shmem.h" #endif @@ -958,6 +961,11 @@ PostmasterMain(int argc, char *argv[]) */ InitializeFastPathLocks(); + /* + * Calculate MaxNBuffers for buffer pool resizing. + */ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. 
*/ diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index fd7c40dcb089..3bc9aee85deb 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -17,6 +17,7 @@ OBJS = \ buf_table.o \ bufmgr.o \ freelist.o \ - localbuf.o + localbuf.o \ + buf_resize.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 4fa547f48dea..4a354107185d 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -17,7 +17,7 @@ #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" -#include "storage/pg_shmem.h" +#include "utils/guc.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -62,11 +62,12 @@ CkptSortItem *CkptBufferIds; /* * Initialize shared buffer pool * - * This is called once during shared-memory initialization (either in the - * postmaster, or in a standalone backend). Size of data structures initialized - * here depends on NBuffers, and to be able to change NBuffers without a - * restart we store each structure into a separate shared memory segment, which - * could be resized on demand. + * This is called once during shared-memory initialization. + * TODO: Restore this function to it's initial form. This function should see no + * change in buffer resize patches, except may be use of NBuffersPending. + * + * No locks are taking in this function, it is the caller responsibility to + * make sure only one backend can work with new buffers. */ void BufferManagerShmemInit(void) @@ -75,24 +76,25 @@ BufferManagerShmemInit(void) foundDescs, foundIOCV, foundBufCkpt; + int i; /* Align descriptors to a cacheline boundary. 
*/ BufferDescriptors = (BufferDescPadded *) ShmemInitStructInSegment("Buffer Descriptors", - NBuffers * sizeof(BufferDescPadded), + NBuffersPending * sizeof(BufferDescPadded), &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, ShmemInitStructInSegment("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + NBuffersPending * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, &foundBufs, BUFFERS_SHMEM_SEGMENT)); /* Align condition variables to cacheline boundary. */ BufferIOCVArray = (ConditionVariableMinimallyPadded *) ShmemInitStructInSegment("Buffer IO Condition Variables", - NBuffers * sizeof(ConditionVariableMinimallyPadded), + NBuffersPending * sizeof(ConditionVariableMinimallyPadded), &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); /* @@ -104,48 +106,54 @@ BufferManagerShmemInit(void) */ CkptBufferIds = (CkptSortItem *) ShmemInitStructInSegment("Checkpoint BufferIds", - NBuffers * sizeof(CkptSortItem), &foundBufCkpt, + NBuffersPending * sizeof(CkptSortItem), &foundBufCkpt, CHECKPOINT_BUFFERS_SHMEM_SEGMENT); if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) { /* should find all of these, or none of them */ Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); - /* note: this path is only taken in EXEC_BACKEND case */ - } - else - { - int i; - /* - * Initialize all the buffer headers. + * note: this path is only taken in EXEC_BACKEND case when initializing + * shared memory. */ - for (i = 0; i < NBuffers; i++) - { - BufferDesc *buf = GetBufferDescriptor(i); + } - ClearBufferTag(&buf->tag); + /* + * Initialize all the buffer headers. 
+ */ + for (i = 0; i < NBuffersPending; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); - pg_atomic_init_u32(&buf->state, 0); - buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; - buf->buf_id = i; + buf->buf_id = i; - pgaio_wref_clear(&buf->io_wref); + pgaio_wref_clear(&buf->io_wref); - LWLockInitialize(BufferDescriptorGetContentLock(buf), - LWTRANCHE_BUFFER_CONTENT); + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); - ConditionVariableInit(BufferDescriptorGetIOCV(buf)); - } + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - /* Init other shared buffer-management stuff */ + /* + * Init other shared buffer-management stuff. + */ StrategyInitialize(!foundDescs); /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); + + /* Declare the size of current buffer pool. */ + NBuffers = NBuffersPending; + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, NBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, NBuffers); } /* @@ -156,6 +164,8 @@ BufferManagerShmemInit(void) * shared memory segment. The main segment must not allocate anything * related to buffers, every other segment will receive part of the * data. + * + * Also sets the shmem_reserved field for each segment based on MaxNBuffers. 
*/ Size BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes) @@ -163,31 +173,222 @@ BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes) size_t size; /* size of buffer descriptors, plus alignment padding */ - size = add_size(0, mul_size(NBuffers, sizeof(BufferDescPadded))); + size = add_size(0, mul_size(NBuffersPending, sizeof(BufferDescPadded))); size = add_size(size, PG_CACHE_LINE_SIZE); mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, mul_size(MaxNBuffers, sizeof(BufferDescPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_reserved = size; /* size of data pages, plus alignment padding */ size = add_size(0, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); + size = add_size(size, mul_size(NBuffersPending, BLCKSZ)); mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(MaxNBuffers, BLCKSZ)); mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_reserved = size; - /* size of stuff controlled by freelist.c */ - mapping_sizes[STRATEGY_SHMEM_SEGMENT].shmem_req_size = StrategyShmemSize(); - mapping_sizes[STRATEGY_SHMEM_SEGMENT].shmem_reserved = StrategyShmemSize(); - /* size of I/O condition variables, plus alignment padding */ - size = add_size(0, mul_size(NBuffers, + size = add_size(0, mul_size(NBuffersPending, sizeof(ConditionVariableMinimallyPadded))); size = add_size(size, PG_CACHE_LINE_SIZE); mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, mul_size(MaxNBuffers, + sizeof(ConditionVariableMinimallyPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_reserved = size; /* size of checkpoint sort array in bufmgr.c */ - mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffers, sizeof(CkptSortItem)); - 
mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(NBuffers, sizeof(CkptSortItem)); + mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffersPending, sizeof(CkptSortItem)); + mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(MaxNBuffers, sizeof(CkptSortItem)); + + /* Allocations in the main memory segment, at the end. */ + + /* size of stuff controlled by freelist.c */ + size = add_size(0, StrategyShmemSize()); return size; } + +/* + * Reinitialize shared buffer manager structures when resizing the buffer pool. + * + * This function is called in the backend which coordinates buffer resizing + * operation. + * + * TODO: Avoid code duplication with BufferManagerShmemInit() and also assess + * which functionality in the latter is required in this function. + */ +void +BufferManagerShmemResize(int currentNBuffers, int targetNBuffers) +{ + bool found; + int i; + void *tmpPtr; + + tmpPtr = (BufferDescPadded *) + ShmemUpdateStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (BufferDescriptors != tmpPtr || !found) + elog(FATAL, "resizing buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemUpdateStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (BufferIOCVArray != tmpPtr || !found) + elog(FATAL, "resizing buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + tmpPtr = (CkptSortItem *) + ShmemUpdateStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if (CkptBufferIds != tmpPtr || !found) + elog(FATAL, "resizing checkpoint buffer IDs failed: expected pointer %p, got %p, 
found=%d", + CkptBufferIds, tmpPtr, found); + + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemUpdateStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (BufferBlocks != tmpPtr || !found) + elog(FATAL, "resizing buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); + + /* + * Initialize the headers for new buffers. If we are shrinking the + * buffers, currentNBuffers >= targetNBuffers, thus this loop doesn't execute. + */ + for (i = currentNBuffers; i < targetNBuffers; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); + + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + + buf->buf_id = i; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + /* + * We do not touch StrategyControl here. Instead it is done by background + * writer when handling PROCSIGNAL_BARRIER_SHBUF_EXPAND or + * PROCSIGNAL_BARRIER_SHBUF_SHRINK barrier. + */ +} + +/* + * BufferManagerShmemValidate + * Validate that buffer manager shared memory structures have correct + * pointers and sizes after a resize operation. + * + * This function is called by backends during ProcessBarrierShmemResizeStruct + * to ensure their view of the buffer structures is consistent after memory + * remapping. 
+ */ +void +BufferManagerShmemValidate(int targetNBuffers) +{ + bool found; + void *tmpPtr; + + /* Validate Buffer Descriptors */ + tmpPtr = (BufferDescPadded *) + ShmemInitStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (!found || BufferDescriptors != tmpPtr) + elog(FATAL, "validating buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + /* Validate Buffer IO Condition Variables */ + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemInitStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (!found || BufferIOCVArray != tmpPtr) + elog(FATAL, "validating buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + /* Validate Checkpoint BufferIds */ + tmpPtr = (CkptSortItem *) + ShmemInitStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if (!found || CkptBufferIds != tmpPtr) + elog(FATAL, "validating checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d", + CkptBufferIds, tmpPtr, found); + + /* Validate Buffer Blocks */ + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (!found || BufferBlocks != tmpPtr) + elog(FATAL, "validating buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); +} + +/* + * check_shared_buffers + * GUC check_hook for shared_buffers + * + * When reloading the configuration, shared_buffers should not be set to a value + * higher than max_shared_buffers fixed at the boot time. 
+ */ +bool +check_shared_buffers(int *newval, void **extra, GucSource source) +{ + if (finalMaxNBuffers && *newval > MaxNBuffers) + { + GUC_check_errdetail("\"shared_buffers\" must be less than \"max_shared_buffers\"."); + return false; + } + return true; +} + +/* + * show_shared_buffers + * GUC show_hook for shared_buffers + * + * Shows both current and pending buffer counts with proper unit formatting. + */ +const char * +show_shared_buffers(void) +{ + static char buffer[128]; + int64 current_value, pending_value; + const char *current_unit, *pending_unit; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + + if (currentNBuffers == NBuffersPending) + { + /* No buffer pool resizing pending. */ + convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s", current_value, current_unit); + } + else + { + /* + * New value for NBuffers is loaded but not applied yet, show both + * current and pending. + */ + convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + convert_int_from_base_unit(NBuffersPending, GUC_UNIT_BLOCKS, &pending_value, &pending_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s (pending: " INT64_FORMAT "%s)", + current_value, current_unit, pending_value, pending_unit); + } + + return buffer; +} diff --git a/src/backend/storage/buffer/buf_resize.c b/src/backend/storage/buffer/buf_resize.c new file mode 100644 index 000000000000..e815600c3ba0 --- /dev/null +++ b/src/backend/storage/buffer/buf_resize.c @@ -0,0 +1,399 @@ +/*------------------------------------------------------------------------- + * + * buf_resize.c + * shared buffer pool resizing functionality + * + * This module contains the implementation of shared buffer pool resizing, + * including the main resize coordination function and barrier processing + * functions that synchronize all backends during resize operations. 
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/buf_resize.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "utils/injection_point.h" + + +/* + * Prepare ShmemCtrl for resizing the shared buffer pool. + */ +static void +MarkBufferResizingStart(int targetNBuffers, int currentNBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == currentNBuffers); + + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, targetNBuffers); + ShmemCtrl->coordinator = MyProcPid; +} + +/* + * Reset ShmemCtrl after resizing the shared buffer pool is done. + */ +static void +MarkBufferResizingEnd(int NBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == NBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, 0); + ShmemCtrl->coordinator = -1; +} + +/* + * Communicate given buffer pool resize barrier to all other backends and the Postmaster. + * + * ProcSignalBarrier is not sent to the Postmaster but we need the Postmaster to + * update its knowledge about the buffer pool so that it can be inherited by the + * child processes. 
+ */ +static void +SharedBufferResizeBarrier(ProcSignalBarrierType barrier, const char *barrier_name) +{ + WaitForProcSignalBarrier(EmitProcSignalBarrier(barrier)); + elog(LOG, "all backends acknowledged %s barrier", barrier_name); + +#ifdef USE_INJECTION_POINTS + /* Injection point specific to this barrier type */ + switch (barrier) + { + case PROCSIGNAL_BARRIER_SHBUF_SHRINK: + INJECTION_POINT("pgrsb-shrink-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM: + INJECTION_POINT("pgrsb-resize-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_EXPAND: + INJECTION_POINT("pgrsb-expand-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED: + /* TODO: Add an injection point here. */ + break; + case PROCSIGNAL_BARRIER_SMGRRELEASE: + /* + * Not relevant in this function but it's here so that the compiler + * can detect any missing shared buffer resizing barrier enum here. + */ + break; + } +#endif /* USE_INJECTION_POINTS */ +} + +/* + * C implementation of SQL interface to update the shared buffers according to + * the current values of shared_buffers GUCs. + * + * The current boundaries of the buffer pool are given by two ranges. + * + * - [1, StrategyControl::activeNBuffers] is the range of buffers from which new + * allocations can happen at any time. + * + * - [1, ShmemCtrl::currentNBuffers] is the range of valid buffers at any given + * time. + * + * Let's assume that before resizing, the number of buffers in the buffer pool is + * NBuffersOld. After resizing it is NBuffersNew. Before resizing + * StrategyControl::activeNBuffers == ShmemCtrl::currentNBuffers == NBuffersOld. + * After the resizing finishes StrategyControl::activeNBuffers == + * ShmemCtrl::currentNBuffers == NBuffersNew. Thus when no resizing happens these + * two ranges are same. + * + * Following steps are performed by the coordinator during resizing. + * + * 1. 
Marks resizing in progress to avoid multiple concurrent invocations of this + * function. + * + * 2. When shrinking the shared buffer pool, the coordinator sends SHBUF_SHRINK + * ProcSignalBarrier. In response to this barrier background writer is expected + * to set StrategyControl::activeNBuffers = NBuffersNew to restrict the new + * buffer allocations only to the new buffer pool size and also reset its + * internal state. Once every backend has acknowledged the barrier, the + * coordinator can be sure that new allocations will not happen in the buffer + * pool area being shrunk. Then it evicts the buffers in that area. Note that + * ShmemCtrl::currentNBuffers is still NBuffersOld, since backend may still + * access buffers allocated before the resizing started. Buffer eviction may fail + * if a buffer being evicted is pinned and the resizing operation is aborted. + * Once the eviction is finished, the extra memory can be freed in the next step. + * + * 3. This step is executed in both cases, when expanding the buffer pool or + * shrinking the buffer pool. The anonymous file backing each of the shared + * memory segments containing the buffer pool shared data structures is resized to + * the amount of memory required for the new buffer pool size. When expanding, the + * expanded portion of memory is initialized appropriately. + * ShmemCtrl::currentNBuffers is set to NBuffersNew to indicate new range of + * valid shared buffers. Every backend is sent SHBUF_RESIZE_MAP_AND_MEM barrier. + * All the backends validate that their pointers to the shared buffers structure + * are valid and have the right size. Once every backend has acknowledged the + * barrier, this step finishes. + * + * 4. When expanding the buffer pool, the coordinator sends SHBUF_EXPAND barrier + * to signal end of expansion. In response, the background writer sets + * StrategyControl::activeNBuffers = NBuffersNew so that new allocations can + * use the expanded range of the buffer pool. 
+ * + * TODO: Handle the case when the backend executing this function dies or the + * query is cancelled or it hits an error while resizing. + */ +Datum +pg_resize_shared_buffers(PG_FUNCTION_ARGS) +{ + bool result = true; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + int targetNBuffers = NBuffersPending; + + if (currentNBuffers == targetNBuffers) + { + elog(LOG, "shared buffers are already at %d, no need to resize", currentNBuffers); + PG_RETURN_BOOL(true); + } + + if (!pg_atomic_test_set_flag(&ShmemCtrl->resize_in_progress)) + { + elog(LOG, "shared buffer resizing already in progress"); + PG_RETURN_BOOL(false); + } + + /* + * TODO: What if the NBuffersPending value seen here is not the desired one + * because somebody did a pg_reload_conf() between the last pg_reload_conf() + * and execution of this function? + */ + MarkBufferResizingStart(targetNBuffers, currentNBuffers); + elog(LOG, "resizing shared buffers from %d to %d", currentNBuffers, targetNBuffers); + + INJECTION_POINT("pg-resize-shared-buffers-flag-set", NULL); + + /* Phase 1: SHBUF_SHRINK - Only for shrinking buffer pool */ + if (targetNBuffers < currentNBuffers) + { + /* + * Phase 1: Shrinking - send SHBUF_SHRINK barrier + * Every backend sets activeNBuffers = NewNBuffers to restrict + * buffer pool allocations to the new size + */ + elog(LOG, "Phase 1: Shrinking buffer pool, restricting allocations to %d buffers", targetNBuffers); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_SHRINK, CppAsString(PROCSIGNAL_BARRIER_SHBUF_SHRINK)); + + /* Evict buffers in the area being shrunk */ + elog(LOG, "evicting buffers %u..%u", targetNBuffers + 1, currentNBuffers); + if (!EvictExtraBuffers(targetNBuffers, currentNBuffers)) + { + elog(WARNING, "failed to evict extra buffers during shrinking"); + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED)); + MarkBufferResizingEnd(currentNBuffers); + 
pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + PG_RETURN_BOOL(false); + } + + /* Update the current NBuffers. */ + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers); + } + + /* Phase 2: SHBUF_RESIZE_MAP_AND_MEM - Both expanding and shrinking */ + elog(LOG, "Phase 2: Remapping shared memory segments and updating structures"); + if (!AnonymousShmemResize()) + { + /* + * This should never fail since address map should already be reserved. + * So the failure should be treated as PANIC. + */ + elog(PANIC, "failed to resize anonymous shared memory"); + } + + /* Update structure pointers and sizes */ + BufferManagerShmemResize(currentNBuffers, targetNBuffers); + + INJECTION_POINT("pgrsb-after-shmem-resize", NULL); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM)); + + /* Phase 3: SHBUF_EXPAND - Only for expanding buffer pool */ + if (targetNBuffers > currentNBuffers) + { + /* + * Phase 3: Expanding - send SHBUF_EXPAND barrier + * Backends set activeNBuffers = NewNBuffers and start allocating + * buffers from the expanded range + */ + elog(LOG, "Phase 3: Expanding buffer pool, enabling allocations up to %d buffers", targetNBuffers); + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_EXPAND, CppAsString(PROCSIGNAL_BARRIER_SHBUF_EXPAND)); + } + + /* + * Reset buffer resize control area. + */ + MarkBufferResizingEnd(targetNBuffers); + + pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + + elog(LOG, "successfully resized shared buffers to %d", targetNBuffers); + + PG_RETURN_BOOL(result); +} + +bool +ProcessBarrierShmemShrink(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. 
+ */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 1: Delaying SHBUF_SHRINK barrier - restricting allocations to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + + return false; + } + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, targetNBuffers); + /* Reset strategy control to new size */ + StrategyReset(targetNBuffers); + } + + elog(LOG, "Phase 1: Processing SHBUF_SHRINK barrier - NBuffers = %d, coordinator is %d", + NBuffers, ShmemCtrl->coordinator); + + return true; +} + +bool +ProcessBarrierShmemResizeMapAndMem(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * If buffer pool is being shrunk, we are already working with a smaller + * buffer pool, so shrinking address space and shared structures should not + * be a problem. When expanding, expanding the address space and shared + * structures beyond the current boundaries is not going to be a problem + * since we are not accessing that memory yet. So there is no reason to + * delay processing this barrier. + */ + + /* + * Coordinator has already adjusted its address map and also updated sizes + * of the shared buffer structures, no further validation needed. + */ + if (ShmemCtrl->coordinator == MyProcPid) + return true; + + /* + * Backends validate that their pointers to shared buffer structures are + * still valid and have the correct size after memory remapping. + * + * TODO: Do want to do this only in assert enabled builds? 
+ */ + BufferManagerShmemValidate(targetNBuffers); + + elog(LOG, "Backend %d successfully validated structure pointers after resize", MyProcPid); + + return true; +} + +bool +ProcessBarrierShmemExpand(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. + */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 3: delaying SHBUF_EXPAND barrier - enabling allocations up to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + return false; + } + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, targetNBuffers); + StrategyReset(targetNBuffers); + } + + elog(LOG, "Phase 3: Processing SHBUF_EXPAND barrier - targetNBuffers = %d, ShmemCtrl->coordinator = %d", targetNBuffers, ShmemCtrl->coordinator); + + return true; +} + +bool +ProcessBarrierShmemResizeFailed(void) +{ + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. 
So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, currentNBuffers); + /* Reset strategy control to new size */ + StrategyReset(currentNBuffers); + } + + elog(LOG, "received proc signal indicating failure to resize shared buffers from %d to %d, restoring to %d, coordinator is %d", + NBuffers, targetNBuffers, currentNBuffers, ShmemCtrl->coordinator); + + return true; +} \ No newline at end of file diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 67e87f9935de..18c9c6f336c1 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -65,11 +65,18 @@ InitBufTable(int size) info.entrysize = sizeof(BufferLookupEnt); info.num_partitions = NUM_BUFFER_PARTITIONS; + /* + * The shared buffer look up table is set up only once with maximum possible + * entries considering maximum size of the buffer pool. It is not resized + * after that even if the buffer pool is resized. Hence it is allocated in + * the main shared memory segment and not in a resizeable shared memory + * segment. 
+ */ SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table", size, size, &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE, - STRATEGY_SHMEM_SEGMENT); + MAIN_SHMEM_SEGMENT); } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 327ddb7adc88..6c8f8552a4cc 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,6 +57,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -3607,6 +3608,32 @@ BufferSync(int flags) TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan); } +/* + * Information saved between BgBufferSync() calls so we can determine the + * strategy point's advance rate and avoid scanning already-cleaned buffers. The + * variables are global instead of static local so that BgBufferSyncReset() can + * adjust it when resizing shared buffers. + */ +static bool saved_info_valid = false; +static int prev_strategy_buf_id; +static uint32 prev_strategy_passes; +static int next_to_clean; +static uint32 next_passes; + +/* Moving averages of allocation rate and clean-buffer density */ +static float smoothed_alloc = 0; +static float smoothed_density = 10.0; + +void +BgBufferSyncReset(int currentNBuffers, int targetNBuffers) +{ + saved_info_valid = false; +#ifdef BGW_DEBUG + elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d", + currentNBuffers, targetNBuffers); +#endif +} + /* * BgBufferSync -- Write out some dirty buffers in the pool. * @@ -3626,20 +3653,6 @@ BgBufferSync(WritebackContext *wb_context) uint32 strategy_passes; uint32 recent_alloc; - /* - * Information saved between calls so we can determine the strategy - * point's advance rate and avoid scanning already-cleaned buffers. 
- */ - static bool saved_info_valid = false; - static int prev_strategy_buf_id; - static uint32 prev_strategy_passes; - static int next_to_clean; - static uint32 next_passes; - - /* Moving averages of allocation rate and clean-buffer density */ - static float smoothed_alloc = 0; - static float smoothed_density = 10.0; - /* Potentially these could be tunables, but for now, not */ float smoothing_samples = 16; float scan_whole_pool_milliseconds = 120000.0; @@ -3662,6 +3675,25 @@ BgBufferSync(WritebackContext *wb_context) long new_strategy_delta; uint32 new_recent_alloc; + /* + * If buffer pool is being shrunk the buffer being written out may not remain + * valid. If the buffer pool is being expanded, more buffers will become + * available without even this function writing out any. Hence wait till + * buffer resizing finishes i.e. go into hibernation mode. + * + * TODO: We may not need this synchronization if background worker itself + * becomes the coordinator. + */ + if (!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) + return true; + + /* + * Resizing shared buffers while this function is performing an LRU scan on + * them may lead to wrong results. Indicate that the resizing should wait for + * the LRU scan to complete. + */ + delay_shmem_resize = true; + /* * Find out where the clock-sweep currently is, and how many buffer * allocations have happened since our last call. @@ -3679,6 +3711,7 @@ BgBufferSync(WritebackContext *wb_context) if (bgwriter_lru_maxpages <= 0) { saved_info_valid = false; + delay_shmem_resize = false; return true; } @@ -3838,8 +3871,17 @@ BgBufferSync(WritebackContext *wb_context) num_written = 0; reusable_buffers = reusable_buffers_est; - /* Execute the LRU scan */ - while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) + /* + * Execute the LRU scan. + * + * If buffer pool is being shrunk, the buffer being written may not remain + * valid. 
If the buffer pool is being expanded, more buffers will become + * available without even this function writing any. Hence stop what we are doing. This + * also unblocks other processes that are waiting for buffer resizing to + * finish. + */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est && + !pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) { int sync_state = SyncOneBuffer(next_to_clean, true, wb_context); @@ -3898,6 +3940,9 @@ BgBufferSync(WritebackContext *wb_context) #endif } + /* Let the resizing commence. */ + delay_shmem_resize = false; + /* Return true if OK to hibernate */ return (bufs_to_lap == 0 && recent_alloc == 0); } @@ -4208,7 +4253,23 @@ DebugPrintBufferRefcount(Buffer buffer) void CheckPointBuffers(int flags) { + /* Mark that buffer sync is in progress - delay any shared memory resizing. */ + /* + * TODO: We need to assess whether we should allow checkpoint and buffer + * resizing to run in parallel. When expanding buffers it may be fine to let + * the checkpointer run in RESIZE_MAP_AND_MEM phase but delay phase EXPAND + * phase till the checkpoint finishes, at the same time not allow checkpoint + * to run during expansion phase. When shrinking the buffers, we should + * delay SHRINK phase till checkpoint finishes and not allow to start + * checkpoint till SHRINK phase is done, but allow it to run in + * RESIZE_MAP_AND_MEM phase. This needs careful analysis and testing. + */ + delay_shmem_resize = true; + BufferSync(flags); + + /* Mark that buffer sync is no longer in progress - allow shared memory resizing */ + delay_shmem_resize = false; } /* @@ -7466,3 +7527,70 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +/* + * When shrinking shared buffers pool, evict the buffers which will not be part + * of the shrunk buffer pool. 
+ */
+bool
+EvictExtraBuffers(int targetNBuffers, int currentNBuffers)
+{
+	bool result = true;
+
+	Assert(targetNBuffers < currentNBuffers);
+
+	/*
+	 * If the buffer being evicted is locked, this function will need to wait.
+	 * This function should not be called from a Postmaster since it cannot wait on a lock.
+	 */
+	Assert(IsUnderPostmaster);
+
+	/*
+	 * TODO: Before evicting any buffer, we should check whether any of the
+	 * buffers are pinned. If we find that a buffer is pinned after evicting
+	 * most of them, that will impact performance since all those evicted
+	 * buffers might need to be read again.
+	 */
+	for (Buffer buf = targetNBuffers + 1; buf <= currentNBuffers; buf++)
+	{
+		BufferDesc *desc = GetBufferDescriptor(buf - 1);
+		uint32 buf_state;
+		bool buffer_flushed;
+
+		buf_state = pg_atomic_read_u32(&desc->state);
+
+		/*
+		 * Nobody is expected to touch the buffers while resizing is
+		 * going on, hence unlocked precheck should be safe and saves
+		 * some cycles.
+		 */
+		if (!(buf_state & BM_VALID))
+			continue;
+
+		/*
+		 * XXX: Looks like CurrentResourceOwner can be NULL here, find
+		 * another one in that case?
+		 * */
+		if (CurrentResourceOwner)
+			ResourceOwnerEnlarge(CurrentResourceOwner);
+
+		ReservePrivateRefCountEntry();
+
+		LockBufHdr(desc);
+
+		/*
+		 * Now that we have locked buffer descriptor, make sure that the
+		 * buffer without valid data has been skipped above. 
+ */ + Assert(buf_state & BM_VALID); + + if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) + { + elog(WARNING, "could not remove buffer %u, it is pinned", buf); + result = false; + break; + } + } + + return result; +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 13ee840ab9f5..256521d889af 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -33,10 +33,16 @@ typedef struct /* Spinlock: protects the values below */ slock_t buffer_strategy_lock; + /* + * Number of active buffers that can be allocated. During buffer resizing, + * this may be different from NBuffers which tracks the global buffer count. + */ + pg_atomic_uint32 activeNBuffers; + /* * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to - * get an actual buffer, it needs to be used modulo NBuffers. + * get an actual buffer, it needs to be used modulo activeNBuffers. */ pg_atomic_uint32 nextVictimBuffer; @@ -101,21 +107,27 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; + int activeBuffers; /* - * Atomically move hand ahead one buffer - if there's several processes - * doing this, this can lead to buffers being returned slightly out of - * apparent order. + * Atomically move hand ahead one buffer - if there's several processes doing + * this, this can lead to buffers being returned slightly out of apparent + * order. We need to read both the current position of hand and the current + * buffer allocation limit together consistently. They may be reset by + * concurrent resize. 
*/ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); + activeBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + SpinLockRelease(&StrategyControl->buffer_strategy_lock); - if (victim >= NBuffers) + if (victim >= activeBuffers) { uint32 originalVictim = victim; /* always wrap what we look up in BufferDescriptors */ - victim = victim % NBuffers; + victim = victim % activeBuffers; /* * If we're the one that just caused a wraparound, force @@ -143,7 +155,7 @@ ClockSweepTick(void) */ SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - wrapped = expected % NBuffers; + wrapped = expected % activeBuffers; success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, &expected, wrapped); @@ -228,7 +240,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); /* Use the "clock sweep" algorithm to find a free buffer */ - trycounter = NBuffers; + trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + for (;;) { uint32 old_buf_state; @@ -281,7 +294,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, local_buf_state)) { - trycounter = NBuffers; + trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers); break; } } @@ -323,10 +336,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) { uint32 nextVictimBuffer; int result; + uint32 activeNBuffers; SpinLockAcquire(&StrategyControl->buffer_strategy_lock); nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); - result = nextVictimBuffer % NBuffers; + activeNBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + result = nextVictimBuffer % activeNBuffers; if (complete_passes) { @@ -336,7 +351,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 
*num_buf_alloc)
 	 * Additionally add the number of wraparounds that happened before
 	 * completePasses could be incremented. C.f. ClockSweepTick().
 	 */
-	*complete_passes += nextVictimBuffer / NBuffers;
+	*complete_passes += nextVictimBuffer / activeNBuffers;
 	}
 
 	if (num_buf_alloc)
@@ -383,7 +398,7 @@ StrategyShmemSize(void)
 	Size size = 0;
 
 	/* size of lookup hash table ... see comment in StrategyInitialize */
-	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+	size = add_size(size, BufTableShmemSize(MaxNBuffers + NUM_BUFFER_PARTITIONS));
 
 	/* size of the shared replacement strategy control block */
 	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -391,6 +406,31 @@ StrategyShmemSize(void)
 	return size;
 }
 
+void
+StrategyReset(int activeNBuffers)
+{
+	Assert(StrategyControl);
+
+	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+	/* Update the active buffer count for the strategy */
+	pg_atomic_write_u32(&StrategyControl->activeNBuffers, activeNBuffers);
+
+	/* Reset the clock-sweep pointer to start from beginning */
+	pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+	/*
+	 * The statistics are viewed in the context of the number of shared buffers.
+	 * Reset them as the active number of shared buffers changes.
+	 */
+	StrategyControl->completePasses = 0;
+	pg_atomic_write_u32(&StrategyControl->numBufferAllocs, 0);
+
+	/* TODO: Do we need to reset background writer notifications? */
+	StrategyControl->bgwprocno = -1;
+	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
 /*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
@@ -408,12 +448,21 @@ StrategyInitialize(bool init)
 	 *
 	 * Since we can't tolerate running out of lookup table entries, we must be
 	 * sure to specify an adequate table size here. The maximum steady-state
-	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
-	 * a new entry before deleting the old. 
In principle this could be - * happening in each partition concurrently, so we could need as many as - * NBuffers + NUM_BUFFER_PARTITIONS entries. + * usage is of course is as many number of entries as the number of buffers + * in the buffer pool. Right now there is no way to free shared memory. Even + * if we shrink the buffer lookup table when shrinking the buffer pool the + * unused hash table entries can not be freed. When we expand the buffer + * pool, more entries can be allocated but we can not resize the hash table + * directory without rehashing all the entries. Just allocating more entries + * will lead to more contention. Hence we setup the buffer lookup table + * considering the maximum possible size of the buffer pool which is + * MaxNBuffers. + * + * Additionally BufferAlloc() tries to insert a new entry before deleting the + * old. In principle this could be happening in each partition concurrently, + * so we need extra NUM_BUFFER_PARTITIONS entries. */ - InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); + InitBufTable(MaxNBuffers + NUM_BUFFER_PARTITIONS); /* * Get or create the shared strategy control block @@ -421,7 +470,7 @@ StrategyInitialize(bool init) StrategyControl = (BufferStrategyControl *) ShmemInitStructInSegment("Buffer Strategy Status", sizeof(BufferStrategyControl), - &found, STRATEGY_SHMEM_SEGMENT); + &found, MAIN_SHMEM_SEGMENT); if (!found) { @@ -432,6 +481,8 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); + /* Initialize the active buffer count */ + pg_atomic_init_u32(&StrategyControl->activeNBuffers, NBuffersPending); /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); @@ -669,12 +720,23 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) strategy->current = 0; /* - * If the slot hasn't been filled yet, tell the caller to allocate a new - * buffer with the normal allocation strategy. 
He will then fill this - * slot by calling AddBufferToRing with the new buffer. + * If the slot hasn't been filled yet or the buffer in the slot has been + * invalidated when buffer pool was shrunk, tell the caller to allocate a new + * buffer with the normal allocation strategy. He will then fill this slot + * by calling AddBufferToRing with the new buffer. + * + * TODO: Ideally we would want to check for bufnum > NBuffers only once + * after every time the buffer pool is shrunk so as to catch any runtime + * bugs that introduce invalid buffers in the ring. But that is complicated. + * The BufferAccessStrategy objects are not accessible outside the + * ScanState. Hence we can not purge the buffers while evicting the buffers. + * After the resizing is finished, it's not possible to notice when we touch + * the first of those objects and the last of objects. See if this can + * fixed. */ bufnum = strategy->buffers[strategy->current]; - if (bufnum == InvalidBuffer) + if (bufnum == InvalidBuffer || + bufnum > pg_atomic_read_u32(&StrategyControl->activeNBuffers)) return NULL; buf = GetBufferDescriptor(bufnum - 1); diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build index 448976d2400b..2fc58db5a917 100644 --- a/src/backend/storage/buffer/meson.build +++ b/src/backend/storage/buffer/meson.build @@ -6,4 +6,5 @@ backend_sources += files( 'bufmgr.c', 'freelist.c', 'localbuf.c', + 'buf_resize.c', ) diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 41190f966395..23e9b53ea074 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -154,6 +154,14 @@ CalculateShmemSize(MemoryMappingSizes *mapping_sizes) size = add_size(size, AioShmemSize()); size = add_size(size, WaitLSNShmemSize()); + /* + * XXX: For some reason slightly more memory is needed for larger + * shared_buffers, but this size is enough for any large value I've tested + * with. 
Is it a mistake in how slots are split, or there was a hidden + * inconsistency in shmem calculation? + */ + size = add_size(size, 1024 * 1024 * 100); + /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); @@ -168,8 +176,7 @@ CalculateShmemSize(MemoryMappingSizes *mapping_sizes) /* might as well round it off to a multiple of a typical page size */ for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++) { - mapping_sizes[segment].shmem_req_size = add_size(mapping_sizes[segment].shmem_req_size, 8192 - (mapping_sizes[segment].shmem_req_size % 8192)); - mapping_sizes[segment].shmem_reserved = add_size(mapping_sizes[segment].shmem_reserved, 8192 - (mapping_sizes[segment].shmem_reserved % 8192)); + round_off_mapping_sizes(&mapping_sizes[segment]); /* Compute the total size of all segments */ size = size + mapping_sizes[segment].shmem_req_size; } @@ -313,6 +320,8 @@ CreateOrAttachShmemStructs(void) CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); + /* TODO: This should be part of BufferManagerShmemInit() */ + ShmemControlInit(); BufferManagerShmemInit(); /* diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 087821311cce..c7c36f2be675 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -24,9 +24,11 @@ #include "port/pg_bitutils.h" #include "replication/logicalworker.h" #include "replication/walsender.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/latch.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/sinval.h" #include "storage/smgr.h" @@ -109,6 +111,10 @@ static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); +#ifdef DEBUG_SHMEM_RESIZE +bool delay_proc_signal_init = false; +#endif + /* * 
ProcSignalShmemSize * Compute space needed for ProcSignal's shared memory @@ -170,6 +176,43 @@ ProcSignalInit(const uint8 *cancel_key, int cancel_key_len) uint32 old_pss_pid; Assert(cancel_key_len >= 0 && cancel_key_len <= MAX_CANCEL_KEY_LENGTH); + +#ifdef DEBUG_SHMEM_RESIZE + /* + * Introduced for debugging purposes. You can change the variable at + * runtime using gdb, then start new backends with delayed ProcSignal + * initialization. Simple pg_usleep wont work here due to SIGHUP interrupt + * needed for testing. Taken from pg_sleep; + */ + if (delay_proc_signal_init) + { +#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0) + float8 endtime = GetNowFloat() + 5; + + for (;;) + { + float8 delay; + long delay_ms; + + CHECK_FOR_INTERRUPTS(); + + delay = endtime - GetNowFloat(); + if (delay >= 600.0) + delay_ms = 600000; + else if (delay > 0.0) + delay_ms = (long) (delay * 1000.0); + else + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + delay_ms, + WAIT_EVENT_PG_SLEEP); + ResetLatch(MyLatch); + } + } +#endif + if (MyProcNumber < 0) elog(ERROR, "MyProcNumber not set"); if (MyProcNumber >= NumProcSignalSlots) @@ -576,6 +619,18 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_SHBUF_SHRINK: + processed = ProcessBarrierShmemShrink(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM: + processed = ProcessBarrierShmemResizeMapAndMem(); + break; + case PROCSIGNAL_BARRIER_SHBUF_EXPAND: + processed = ProcessBarrierShmemExpand(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED: + processed = ProcessBarrierShmemResizeFailed(); + break; } /* diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index f303a9328dfc..eafcb665ba91 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -69,11 +69,19 @@ #include "funcapi.h" #include "miscadmin.h" #include 
"port/pg_numa.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
+#include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/procsignal.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/injection_point.h"
+#include "utils/wait_event.h"
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 static void *ShmemAllocRawInSegment(Size size, Size *allocated_size,
@@ -493,8 +501,7 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 	{
 		/*
 		 * Structure is in the shmem index so someone else has allocated it
-		 * already. The size better be the same as the size we are trying to
-		 * initialize to, or there is a name conflict (or worse).
+		 * already. The size better be the same as the size we are trying to initialize to, or there is a name conflict (or worse).
 		 */
 		if (result->size != size)
 		{
@@ -504,6 +511,7 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 			 " \"%s\": expected %zu, actual %zu",
 			 name, size, result->size)));
 		}
+
 		structPtr = result->location;
 	}
 	else
@@ -538,6 +546,59 @@ ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr,
 	return structPtr;
 }
 
+/*
+ * ShmemUpdateStructInSegment -- Update the size of a structure in shared memory.
+ *
+ * This function updates the size of an existing shared memory structure. It
+ * finds the structure in the shmem index and updates its size information while
+ * preserving the existing memory location.
+ *
+ * Returns: pointer to the existing structure location. 
+ */ +void * +ShmemUpdateStructInSegment(const char *name, Size size, bool *foundPtr, + int shmem_segment) +{ + ShmemIndexEnt *result; + void *structPtr; + Size delta; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + Assert(ShmemIndex); + + /* Look up the structure in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_FIND, foundPtr); + + Assert(*foundPtr); + Assert(result); + Assert(result->shmem_segment == shmem_segment); + + delta = size - result->size; + /* Store the existing structure pointer */ + structPtr = result->location; + + /* Update the size information. + TODO: Ideally we should implement repalloc kind of functionality for shared memory which will return allocated size. */ + result->size = size; + result->allocated_size = size; + + /* Reflect size change in the shared segment */ + SpinLockAcquire(Segments[shmem_segment].ShmemLock); + Segments[shmem_segment].ShmemSegHdr->freeoffset += delta; + SpinLockRelease(Segments[shmem_segment].ShmemLock); + LWLockRelease(ShmemIndexLock); + + /* Verify the structure is still in the correct segment */ + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + + + /* * Add two Size values, checking for overflow */ @@ -871,4 +932,3 @@ pg_get_shmem_segments(PG_FUNCTION_ARGS) return (Datum) 0; } - diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7dd75a490aab..9c9ebe4280a0 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -63,6 +63,7 @@ #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "storage/procsignal.h" @@ -4128,6 +4129,9 @@ PostgresSingleUserMain(int argc, char *argv[], /* Initialize size of fast-path lock cache. */ InitializeFastPathLocks(); + /* Initialize MaxNBuffers for buffer pool resizing. 
*/ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. */ @@ -4318,6 +4322,13 @@ PostgresMain(const char *dbname, const char *username) */ BeginReportingGUCOptions(); + /* + * TODO: The new backend should fetch the shared buffers status. If the + * resizing is going on, it should bring itself upto speed with it. If not, + * simply fetch the latest pointers are sizes. Is this the right place to do + * that? + */ + /* * Also set up handler to log session end; we have to wait till now to be * sure Log_disconnections has its final value. diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index c1ac71ff7f24..ee5887496baf 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -162,6 +162,7 @@ WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end." +PM_BUFFER_RESIZE_WAIT "Waiting for the postmaster to complete shared buffer pool resize operations." ABI_compatibility: @@ -358,6 +359,7 @@ InjectionPoint "Waiting to read or update information related to injection point SerialControl "Waiting to read or update shared pg_serial state." AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." WaitLSN "Waiting to read or update shared Wait-for-LSN state." +ShmemResize "Waiting to resize shared memory." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index d31cb45a0588..419c7fad8901 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -139,7 +139,10 @@ int max_parallel_maintenance_workers = 2; * MaxBackends is computed by PostmasterMain after modules have had a chance to * register background workers. */ -int NBuffers = 16384; +int NBuffers = 0; +int NBuffersPending = 16384; +bool finalMaxNBuffers = false; +int MaxNBuffers = 0; int MaxConnections = 100; int max_worker_processes = 8; int max_parallel_workers = 8; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 98f9598cd789..46a8a8a3faad 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -595,6 +595,55 @@ InitializeFastPathLocks(void) pg_nextpower2_32(FastPathLockGroupsPerBackend)); } +/* + * Initialize MaxNBuffers variable with validation. + * + * This must be called after GUCs have been loaded but before shared memory size + * is determined. + * + * Since MaxNBuffers limits the size of the buffer pool, it must be at least as + * much as NBuffersPending. If MaxNBuffers is 0 (default), set it to + * NBuffersPending. Otherwise, validate that MaxNBuffers is not less than + * NBuffersPending. + */ +void +InitializeMaxNBuffers(void) +{ + if (MaxNBuffers == 0) /* default/boot value */ + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", NBuffersPending); + SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + + /* + * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT. + * However, if the DBA explicitly set max_shared_buffers = 0 in + * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override + * that and we must force the matter with PGC_S_OVERRIDE. + */ + if (MaxNBuffers == 0) /* failed to apply it? 
*/ + SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + else + { + if (MaxNBuffers < NBuffersPending) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("max_shared_buffers (%d) cannot be less than current shared_buffers (%d)", + MaxNBuffers, NBuffersPending), + errhint("Increase max_shared_buffers or decrease shared_buffers."))); + } + } + + Assert(MaxNBuffers > 0); + Assert(!finalMaxNBuffers); + finalMaxNBuffers = true; +} + /* * Early initialization of a backend (either standalone or under postmaster). * This happens even before InitPostgres. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c6484aea087c..96233ba5cb27 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2599,7 +2599,7 @@ convert_to_base_unit(double value, const char *unit, * the value without loss. For example, if the base unit is GUC_UNIT_KB, 1024 * is converted to 1 MB, but 1025 is represented as 1025 kB. */ -static void +void convert_int_from_base_unit(int64 base_value, int base_unit, int64 *value, const char **unit) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 1128167c0251..539b29f0065a 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2013,6 +2013,15 @@ max => 'MAX_BACKENDS /* XXX? 
*/', }, +{ name => "max_shared_buffers", type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the upper limit for the shared_buffers value.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'MaxNBuffers', + boot_val => '0', + min => '0', + max => 'INT_MAX / 2', +}, + { name => 'max_slot_wal_keep_size', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING', short_desc => 'Sets the maximum WAL size that can be reserved by replication slots.', long_desc => 'Replication slots will be marked as failed, and segments released for deletion or recycling, if this much space is occupied by WAL on disk. -1 means no maximum.', @@ -2581,13 +2590,15 @@ # We sometimes multiply the number of shared buffers by two without # checking for overflow, so we mustn't allow more than INT_MAX / 2. -{ name => 'shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', +{ name => 'shared_buffers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM', short_desc => 'Sets the number of shared memory buffers used by the server.', flags => 'GUC_UNIT_BLOCKS', - variable => 'NBuffers', + variable => 'NBuffersPending', boot_val => '16384', min => '16', max => 'INT_MAX / 2', + check_hook => 'check_shared_buffers', + show_hook => 'show_shared_buffers', }, { name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 56c215b7be9c..d0c9e6ec7577 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12620,4 +12620,10 @@ proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}', prosrc => 'pg_get_aios' }, +{ oid => '9999', descr => 'resize shared buffers according to the value of GUC `shared_buffers`', + proname => 'pg_resize_shared_buffers', + provolatile => 'v', + 
prorettype => 'bool', + proargtypes => '', + prosrc => 'pg_resize_shared_buffers'}, ] diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9a7d733ddeff..b4dc2c4ba57d 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -173,7 +173,11 @@ extern PGDLLIMPORT bool ExitOnAnyError; extern PGDLLIMPORT char *DataDir; extern PGDLLIMPORT int data_directory_mode; +/* TODO: This is no more a GUC variable; should be moved somewhere else. */ extern PGDLLIMPORT int NBuffers; +extern PGDLLIMPORT int NBuffersPending; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; extern PGDLLIMPORT int MaxBackends; extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; @@ -502,6 +506,7 @@ extern PGDLLIMPORT ProcessingMode Mode; extern void pg_split_opts(char **argv, int *argcp, const char *optstr); extern void InitializeMaxBackends(void); extern void InitializeFastPathLocks(void); +extern void InitializeMaxNBuffers(void); extern void InitPostgres(const char *in_dbname, Oid dboid, const char *username, Oid useroid, bits32 flags, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 519692702a02..4c53194e13e4 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -513,6 +513,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern void StrategyReset(int activeNBuffers); /* buf_table.c */ extern Size BufTableShmemSize(int size); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3769f4db7dc6..774cf8f38edd 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -21,6 +21,7 @@ #include "storage/bufpage.h" #include "storage/pg_shmem.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" #include "utils/relcache.h" #include "utils/snapmgr.h" @@ -159,6 +160,7 @@ typedef struct 
WritebackContext WritebackContext; /* in globals.c ... this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; +extern PGDLLIMPORT int NBuffersPending; /* in bufmgr.c */ extern PGDLLIMPORT bool zero_damaged_pages; @@ -205,6 +207,11 @@ extern PGDLLIMPORT int32 *LocalRefCount; #define BUFFER_LOCK_SHARE 1 #define BUFFER_LOCK_EXCLUSIVE 2 +/* + * prototypes for functions in buf_init.c + */ +extern const char *show_shared_buffers(void); +extern bool check_shared_buffers(int *newval, void **extra, GucSource source); /* * prototypes for functions in bufmgr.c @@ -308,6 +315,7 @@ extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); extern bool BgBufferSync(WritebackContext *wb_context); +extern void BgBufferSyncReset(int currentNBuffers, int targetNBuffers); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); @@ -324,10 +332,13 @@ extern void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped); +extern bool EvictExtraBuffers(int targetNBuffers, int currentNBuffers); /* in buf_init.c */ extern void BufferManagerShmemInit(void); extern Size BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes); +extern void BufferManagerShmemResize(int currentNBuffers, int targetNBuffers); +extern void BufferManagerShmemValidate(int targetNBuffers); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); @@ -376,7 +387,7 @@ extern void FreeAccessStrategy(BufferAccessStrategy strategy); static inline bool BufferIsValid(Buffer bufnum) { - Assert(bufnum <= NBuffers); + Assert(bufnum <= (Buffer) pg_atomic_read_u32(&ShmemCtrl->currentNBuffers)); Assert(bufnum >= -NLocBuffer); return bufnum != InvalidBuffer; @@ -430,4 +441,11 @@ BufferGetPage(Buffer buffer) #endif /* FRONTEND */ +/* buf_resize.c */ +extern Datum pg_resize_shared_buffers(PG_FUNCTION_ARGS); +extern bool ProcessBarrierShmemShrink(void); +extern bool ProcessBarrierShmemResizeMapAndMem(void); 
+extern bool ProcessBarrierShmemExpand(void); +extern bool ProcessBarrierShmemResizeFailed(void); + #endif /* BUFMGR_H */ diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index d73f1b407db8..6dbbb9ad064a 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -66,6 +66,7 @@ typedef void (*shmem_startup_hook_type) (void); /* ipc.c */ extern PGDLLIMPORT bool proc_exit_inprogress; extern PGDLLIMPORT bool shmem_exit_inprogress; +extern PGDLLIMPORT volatile bool delay_shmem_resize; pg_noreturn extern void proc_exit(int code); extern void shmem_exit(int code); @@ -85,5 +86,7 @@ extern void CreateSharedMemoryAndSemaphores(void); extern void AttachSharedMemoryStructs(void); #endif extern void InitializeShmemGUCs(void); +extern void CoordinateShmemResize(void); +extern bool AnonymousShmemResize(void); #endif /* IPC_H */ diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 5b0ce383408c..9c4b928441ce 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -86,6 +86,7 @@ PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) PG_LWLOCK(54, WaitLSN) +PG_LWLOCK(55, ShmemResize) /* * There also exist several built-in LWLock tranches. 
As with the predefined diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index beee0a53d2da..369000688209 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -24,8 +24,13 @@ #ifndef PG_SHMEM_H #define PG_SHMEM_H +#include "port/atomics.h" +#include "storage/barrier.h" #include "storage/dsm_impl.h" +#include "storage/procsignal.h" #include "storage/spin.h" +#include "storage/shmem.h" +#include "utils/guc.h" typedef struct MemoryMappingSizes { @@ -65,15 +70,39 @@ typedef struct ShmemSegment } ShmemSegment; /* Number of available segments for anonymous memory mappings */ -#define NUM_MEMORY_MAPPINGS 6 +#define NUM_MEMORY_MAPPINGS 5 extern PGDLLIMPORT ShmemSegment Segments[NUM_MEMORY_MAPPINGS]; +/* + * ShmemControl is shared between backends and helps to coordinate shared + * memory resize. + * + * TODO: I think we need a lock to protect this structure. If we do so, do we + * need to use atomic integers? + */ +typedef struct +{ + pg_atomic_flag resize_in_progress; /* true if resizing is in progress. false otherwise. */ + pg_atomic_uint32 currentNBuffers; /* Original NBuffers value before resize started */ + pg_atomic_uint32 targetNBuffers; + pid_t coordinator; +} ShmemControl; + +extern PGDLLIMPORT ShmemControl *ShmemCtrl; + +/* The phases for shared memory resizing, used by for ProcSignal barrier. */ +#define SHMEM_RESIZE_REQUESTED 0 +#define SHMEM_RESIZE_START 1 +#define SHMEM_RESIZE_DONE 2 + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; extern PGDLLIMPORT int huge_pages_status; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; /* Possible values for huge_pages and huge_pages_status */ typedef enum @@ -113,6 +142,17 @@ extern void PGSharedMemoryReAttach(void); extern void PGSharedMemoryNoReAttach(void); #endif +/* + * round off mapping size to a multiple of a typical page size. 
+ */ +static inline void +round_off_mapping_sizes(MemoryMappingSizes *mapping_sizes) +{ + mapping_sizes->shmem_req_size = add_size(mapping_sizes->shmem_req_size, 8192 - (mapping_sizes->shmem_req_size % 8192)); + mapping_sizes->shmem_reserved = add_size(mapping_sizes->shmem_reserved, 8192 - (mapping_sizes->shmem_reserved % 8192)); +} + + extern PGShmemHeader *PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id, PGShmemHeader **shim); extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); @@ -122,6 +162,13 @@ extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags); void PrepareHugePages(void); +bool ProcessBarrierShmemResize(Barrier *barrier); +const char *show_shared_buffers(void); +bool check_shared_buffers(int *newval, void **extra, GucSource source); +void AdjustShmemSize(void); +extern void WaitOnShmemBarrier(void); +extern void ShmemControlInit(void); + /* * To be able to dynamically resize largest parts of the data stored in shared * memory, we split it into multiple shared memory mappings segments. Each @@ -144,7 +191,4 @@ void PrepareHugePages(void); /* Checkpoint BufferIds */ #define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4 -/* Buffer strategy status */ -#define STRATEGY_SHMEM_SEGMENT 5 - #endif /* PG_SHMEM_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 428aa3fd68a0..5ced2a835370 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -42,9 +42,10 @@ typedef enum PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ PMSIGNAL_XLOG_IS_SHUTDOWN, /* ShutdownXLOG() completed */ + PMSIGNAL_SHMEM_RESIZE, /* resize shared memory */ } PMSignalReason; -#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1) +#define NUM_PMSIGNALS (PMSIGNAL_SHMEM_RESIZE+1) /* * Reasons why the postmaster would send SIGQUIT to its children. 
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index afeeb1ca019f..4de11faf12d4 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,6 +54,10 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ + PROCSIGNAL_BARRIER_SHBUF_SHRINK, /* shrink buffer pool - restrict allocations to new size */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, /* remap shared memory segments and update structure pointers */ + PROCSIGNAL_BARRIER_SHBUF_EXPAND, /* expand buffer pool - enable allocations in new range */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, /* signal backends that the shared buffer resizing failed. */ } ProcSignalBarrierType; /* diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index c56712555f09..d59e5ba6dcd6 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -49,11 +49,14 @@ extern HTAB *ShmemInitHashInSegment(const char *name, long init_size, extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); extern void *ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, int shmem_segment); +extern void *ShmemUpdateStructInSegment(const char *name, Size size, + bool *foundPtr, int shmem_segment); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); extern PGDLLIMPORT Size pg_get_shmem_pagesize(void); + /* ipci.c */ extern void RequestAddinShmemSpace(Size size); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index f21ec37da893..08a84373fb70 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -459,6 +459,8 @@ extern config_handle *get_config_handle(const char *name); extern void AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt); extern char *GetConfigOptionByName(const char *name, const char **varname, bool missing_ok); +extern void convert_int_from_base_unit(int64 base_value, int base_unit, + int64 *value, const char 
**unit); extern void TransformGUCArray(ArrayType *array, List **names, List **values); diff --git a/src/test/Makefile b/src/test/Makefile index 511a72e6238a..95f8858a8183 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,7 @@ subdir = src/test top_builddir = ../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription +SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription buffermgr ifeq ($(with_icu),yes) SUBDIRS += icu diff --git a/src/test/README b/src/test/README index afdc76765190..77f11607ff76 100644 --- a/src/test/README +++ b/src/test/README @@ -15,6 +15,9 @@ examples/ Demonstration programs for libpq that double as regression tests via "make check" +buffermgr/ + Tests for resizing buffer pool without restarting the server + isolation/ Tests for concurrent behavior at the SQL level diff --git a/src/test/buffermgr/Makefile b/src/test/buffermgr/Makefile new file mode 100644 index 000000000000..eb275027fa60 --- /dev/null +++ b/src/test/buffermgr/Makefile @@ -0,0 +1,30 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/buffermgr +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/buffermgr/Makefile +# +#------------------------------------------------------------------------- + +EXTRA_INSTALL = contrib/pg_buffercache + +REGRESS = buffer_resize + +# Custom configuration for buffer manager tests +TEMP_CONFIG = $(srcdir)/buffermgr_test.conf + +subdir = src/test/buffermgr +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean: + rm -rf tmp_check diff --git a/src/test/buffermgr/README b/src/test/buffermgr/README new file mode 100644 index 000000000000..c375ad809892 --- /dev/null +++ b/src/test/buffermgr/README @@ -0,0 +1,26 @@ +src/test/buffermgr/README + +Regression tests for buffer manager +=================================== + +This directory contains a test suite for resizing the buffer manager without restarting the server. + + +Running the tests +================= + +NOTE: You must have given the --enable-tap-tests argument to configure. + +Run + make check +or + make installcheck +You can use "make installcheck" if you previously did "make install". +In that case, the code in the installation tree is tested. With +"make check", a temporary installation tree is built from the current +sources and then tested. + +Either way, this test initializes, starts, and stops a test Postgres +cluster. + +See src/test/perl/README for more info about running these tests. diff --git a/src/test/buffermgr/buffermgr_test.conf b/src/test/buffermgr/buffermgr_test.conf new file mode 100644 index 000000000000..b7c0065c80b8 --- /dev/null +++ b/src/test/buffermgr/buffermgr_test.conf @@ -0,0 +1,11 @@ +# Configuration for buffer manager regression tests + +# Even if max_shared_buffers is set multiple times only the last one is used +as the limit on shared_buffers. 
+max_shared_buffers = 128kB +# Set initial shared_buffers as expected by test +shared_buffers = 128MB +# Set a larger value for max_shared_buffers to allow testing resize operations +max_shared_buffers = 300MB +# Turn huge pages off, since that affects the size of memory segments +huge_pages = off \ No newline at end of file diff --git a/src/test/buffermgr/expected/buffer_resize.out b/src/test/buffermgr/expected/buffer_resize.out new file mode 100644 index 000000000000..d5cb9d784372 --- /dev/null +++ b/src/test/buffermgr/expected/buffer_resize.out @@ -0,0 +1,329 @@ +-- Test buffer pool resizing and shared memory allocation tracking +-- This test resizes the buffer pool multiple times and monitors +-- shared memory allocations related to buffer management +-- TODO: The test sets shared_buffers values in MBs. Instead it could use values +-- in kBs so that the test runs on very small machines. +-- Create a view for buffer-related shared memory allocations +CREATE VIEW buffer_allocations AS +SELECT name, segment, size, allocated_size +FROM pg_shmem_allocations +WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables', + 'Checkpoint BufferIds') +ORDER BY name; +-- Note: We exclude the 'main' segment even if it contains the shared buffer +-- lookup table because it contains other shared structures whose total sizes +-- may vary as the code changes. 
+CREATE VIEW buffer_segments AS +SELECT name, size, mapping_size, mapping_reserved_size +FROM pg_shmem_segments +WHERE name <> 'main' +ORDER BY name; +-- Enable pg_buffercache for buffer count verification +CREATE EXTENSION IF NOT EXISTS pg_buffercache; +-- Test 1: Default shared_buffers +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. 
+SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Test 2: Set to 64MB +ALTER SYSTEM SET shared_buffers = '64MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 128MB (pending: 64MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 64MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+----------+---------------- + Buffer Blocks | buffers | 67112960 | 67112960 + Buffer Descriptors | descriptors | 524288 | 524288 + Buffer IO Condition Variables | iocv | 131072 | 131072 + Checkpoint BufferIds | checkpoint | 163840 | 163840 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+----------+--------------+----------------------- + 
buffers | 67117056 | 67117056 | 314580992 + checkpoint | 172032 | 172032 | 770048 + descriptors | 532480 | 532480 | 2465792 + iocv | 139264 | 139264 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 8192 +(1 row) + +-- Test 3: Set to 256MB +ALTER SYSTEM SET shared_buffers = '256MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 64MB (pending: 256MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 256MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 268439552 | 268439552 + Buffer Descriptors | descriptors | 2097152 | 2097152 + Buffer IO Condition Variables | iocv | 524288 | 524288 + Checkpoint BufferIds | checkpoint | 655360 | 655360 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 268443648 | 268443648 | 314580992 + checkpoint | 663552 | 663552 | 770048 + descriptors | 2105344 | 2105344 | 2465792 + iocv | 532480 | 532480 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 32768 +(1 row) + +-- Test 4: Set to 100MB (non-power-of-two) +ALTER SYSTEM SET shared_buffers = '100MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +------------------------ + 256MB (pending: 100MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW 
shared_buffers; + shared_buffers +---------------- + 100MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 104861696 | 104861696 + Buffer Descriptors | descriptors | 819200 | 819200 + Buffer IO Condition Variables | iocv | 204800 | 204800 + Checkpoint BufferIds | checkpoint | 256000 | 256000 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 104865792 | 104865792 | 314580992 + checkpoint | 262144 | 262144 | 770048 + descriptors | 827392 | 827392 | 2465792 + iocv | 212992 | 212992 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 12800 +(1 row) + +-- Test 5: Set to minimum 128kB +ALTER SYSTEM SET shared_buffers = '128kB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +------------------------ + 100MB (pending: 128kB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+--------+---------------- + Buffer Blocks | buffers | 135168 | 135168 + Buffer Descriptors | descriptors | 1024 | 1024 + Buffer IO Condition Variables | iocv | 256 | 256 + Checkpoint BufferIds | checkpoint | 320 | 320 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+--------+--------------+----------------------- + buffers | 139264 | 139264 | 314580992 + checkpoint | 8192 | 8192 | 770048 + descriptors | 8192 | 8192 | 2465792 + iocv | 8192 | 8192 
| 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16 +(1 row) + +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +ERROR: invalid value for parameter "shared_buffers": 51200 +DETAIL: "shared_buffers" must be less than "max_shared_buffers". +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build new file mode 100644 index 000000000000..c24bff721e60 --- /dev/null +++ b/src/test/buffermgr/meson.build @@ -0,0 +1,23 @@ +# Copyright (c) 2022-2025, PostgreSQL Global Development Group + +tests += { + 'name': 'buffermgr', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'buffer_resize', + ], + 'regress_args': ['--temp-config', files('buffermgr_test.conf')], + }, + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_resize_buffer.pl', + 't/003_parallel_resize_buffer.pl', + 't/004_client_join_buffer_resize.pl', + ], + }, +} diff --git a/src/test/buffermgr/sql/buffer_resize.sql b/src/test/buffermgr/sql/buffer_resize.sql new file mode 100644 index 000000000000..dfaaeabfcbbb --- /dev/null +++ b/src/test/buffermgr/sql/buffer_resize.sql @@ -0,0 +1,95 @@ +-- Test buffer pool resizing and shared memory allocation tracking +-- This test resizes the buffer pool multiple times and monitors +-- shared memory allocations related to buffer management +-- TODO: The test sets shared_buffers values in MBs. 
Instead it could use values +-- in kBs so that the test runs on very small machines. + +-- Create a view for buffer-related shared memory allocations +CREATE VIEW buffer_allocations AS +SELECT name, segment, size, allocated_size +FROM pg_shmem_allocations +WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables', + 'Checkpoint BufferIds') +ORDER BY name; + +-- Note: We exclude the 'main' segment even if it contains the shared buffer +-- lookup table because it contains other shared structures whose total sizes +-- may vary as the code changes. +CREATE VIEW buffer_segments AS +SELECT name, size, mapping_size, mapping_reserved_size +FROM pg_shmem_segments +WHERE name <> 'main' +ORDER BY name; + +-- Enable pg_buffercache for buffer count verification +CREATE EXTENSION IF NOT EXISTS pg_buffercache; + +-- Test 1: Default shared_buffers +SHOW shared_buffers; +SHOW max_shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. 
+SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 2: Set to 64MB +ALTER SYSTEM SET shared_buffers = '64MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 3: Set to 256MB +ALTER SYSTEM SET shared_buffers = '256MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 4: Set to 100MB (non-power-of-two) +ALTER SYSTEM SET shared_buffers = '100MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 5: Set to minimum 128kB +ALTER SYSTEM SET shared_buffers = '128kB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; +SHOW max_shared_buffers; diff --git a/src/test/buffermgr/t/001_resize_buffer.pl 
b/src/test/buffermgr/t/001_resize_buffer.pl new file mode 100644 index 000000000000..a0d7f0941713 --- /dev/null +++ b/src/test/buffermgr/t/001_resize_buffer.pl @@ -0,0 +1,135 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Minimal test testing shared_buffer resizing under load + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Function to resize buffer pool and verify the change. +sub apply_and_verify_buffer_change +{ + my ($node, $new_size) = @_; + + # Use the new pg_resize_shared_buffers() interface which handles everything synchronously + $node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '$new_size'"); + $node->safe_psql('postgres', "SELECT pg_reload_conf()"); + + # If resize function fails, try a few times before giving up + my $max_retries = 5; + my $retry_delay = 1; # seconds + my $success = 0; + for my $attempt (1..$max_retries) { + my $result = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()"); + if ($result eq 't') { + $success = 1; + last; + } + + # If not the last attempt, wait before retrying + if ($attempt < $max_retries) { + note "Resizing buffer pool to $new_size, attempt $attempt failed, retrying after $retry_delay seconds..."; + sleep($retry_delay); + } + } + + is($success, 1, 'resizing to ' . $new_size . ' succeeded after retries'); + is($node->safe_psql('postgres', "SHOW shared_buffers"), $new_size, + 'SHOW after resizing to '. $new_size . ' succeeded'); +} + +# Initialize a cluster and start pgbench in the background for concurrent load. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; + +# Permit resizing up to 1GB for this test and let the server start with 128MB. 
+$node->append_conf('postgresql.conf', qq{ +max_shared_buffers = 1GB +shared_buffers = 128MB +log_statement = none +}); + +$node->start; +$node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache"); +my $pgb_scale = 10; +my $pgb_duration = 120; +my $pgb_num_clients = 10; +$node->pgbench( + "--initialize --init-steps=dtpvg --scale=$pgb_scale --quiet", + 0, + [qr{^$}], + [ # stderr patterns to verify initialization stages + qr{dropping old tables}, + qr{creating tables}, + qr{done in \d+\.\d\d s } + ], + "pgbench initialization (scale=$pgb_scale)" +); +my ($pgbench_stdin, $pgbench_stdout, $pgbench_stderr) = ('', '', ''); +# Use --exit-on-abort so that the test stops on the first server crash or error, +# thus making it easy to debug the failure. Use -C to increase the chances of a +# new backend being created while resizing the buffer pool. +my $pgbench_process = IPC::Run::start( + [ + 'pgbench', + '-p', $node->port, + '-T', $pgb_duration, + '-c', $pgb_num_clients, + '-C', + '--exit-on-abort', + 'postgres' + ], + '<' => \$pgbench_stdin, + '>' => \$pgbench_stdout, + '2>' => \$pgbench_stderr +); + +ok($pgbench_process, "pgbench started successfully"); + +# Allow pgbench to establish connections and start generating load. +# +# TODO: When creating new backends is known to work well with buffer pool +# resizing, this wait should be removed. +sleep(1); + +# Resize buffer pool to various sizes while pgbench is running in the +# background. +# +# TODO: These are pseudo-randomly picked sizes, but we can do better. +my $tests_completed = 0; +my @buffer_sizes = ('900MB', '500MB', '250MB', '400MB', '120MB', '600MB'); +for my $target_size (@buffer_sizes) +{ + # Verify workload generator is still running + if (!$pgbench_process->pumpable) { + ok(0, "pgbench is still running"); + last; + } + + apply_and_verify_buffer_change($node, $target_size); + $tests_completed++; + + # Wait for the resized buffer pool to stabilize. 
If the resized buffer pool + # is utilized fully, it might hit any wrongly initialized areas of shared + # memory. + sleep(2); +} +is($tests_completed, scalar(@buffer_sizes), "All buffer sizes were tested"); + +# Make sure that pgbench can end normally. +$pgbench_process->signal('TERM'); +IPC::Run::finish $pgbench_process; +ok(grep { $pgbench_process->result == $_ } (0, 15), "pgbench exited gracefully"); + +# Log any error output from pgbench for debugging +diag("pgbench stderr:\n$pgbench_stderr"); +diag("pgbench stdout:\n$pgbench_stdout"); + +# Ensure database is still functional after all the buffer changes +$node->connect_ok("dbname=postgres", + "Database remains accessible after $tests_completed buffer resize operations"); + +done_testing(); \ No newline at end of file diff --git a/src/test/buffermgr/t/003_parallel_resize_buffer.pl b/src/test/buffermgr/t/003_parallel_resize_buffer.pl new file mode 100644 index 000000000000..9cbb5452fd27 --- /dev/null +++ b/src/test/buffermgr/t/003_parallel_resize_buffer.pl @@ -0,0 +1,71 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test that only one pg_resize_shared_buffers() call succeeds when multiple +# sessions attempt to resize buffers concurrently + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Initialize a cluster +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', 'shared_buffers = 128kB'); +$node->append_conf('postgresql.conf', 'max_shared_buffers = 256kB'); +$node->start; + +# Load injection points extension for test coordination +$node->safe_psql('postgres', "CREATE EXTENSION injection_points"); + +# 
Test 1: Two concurrent pg_resize_shared_buffers() calls +# Set up injection point to pause the first resize call +$node->safe_psql('postgres', + "SELECT injection_points_attach('pg-resize-shared-buffers-flag-set', 'wait')"); + +# Change shared_buffers for the resize operation +$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '144kB'"); +$node->safe_psql('postgres', "SELECT pg_reload_conf()"); + +# Start first resize session (will pause at injection point) +my $session1 = $node->background_psql('postgres'); +$session1->query_until( + qr/starting_resize/, + q( + \echo starting_resize + SELECT pg_resize_shared_buffers(); + ) +); + +# Wait until session actually reaches the injection point +$node->wait_for_event('client backend', 'pg-resize-shared-buffers-flag-set'); + +# Start second resize session (should fail immediately since resize is in progress) +my $result2 = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()"); + +# The second call should return false (already in progress) +is($result2, 'f', 'Second concurrent resize call returns false'); + +# Wake up the first session +$node->safe_psql('postgres', + "SELECT injection_points_wakeup('pg-resize-shared-buffers-flag-set')"); + +# The pg_resize_shared_buffers() in session1 should now complete successfully +# We can't easily capture the return value from query_until, but we can +# verify the session completes without error and the resize actually happened +$session1->quit; + +# Detach injection point +$node->safe_psql('postgres', + "SELECT injection_points_detach('pg-resize-shared-buffers-flag-set')"); + +done_testing(); \ No newline at end of file diff --git a/src/test/buffermgr/t/004_client_join_buffer_resize.pl b/src/test/buffermgr/t/004_client_join_buffer_resize.pl new file mode 100644 index 000000000000..06f0de6b4091 --- /dev/null +++ b/src/test/buffermgr/t/004_client_join_buffer_resize.pl @@ -0,0 +1,241 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test 
shared_buffer resizing coordination with client connections joining using injection points + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use Time::HiRes qw(sleep); + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Function to calculate the size of test table required to fill up maximum +# buffer pool when populating it. +sub calculate_test_sizes +{ + my ($node, $block_size) = @_; + + # Get the maximum buffer pool size from configuration + my $max_shared_buffers = $node->safe_psql('postgres', "SHOW max_shared_buffers"); + my ($max_val, $max_unit) = ($max_shared_buffers =~ /(\d+)(\w+)/); + my $max_size_bytes; + if (lc($max_unit) eq 'kb') { + $max_size_bytes = $max_val * 1024; + } elsif (lc($max_unit) eq 'mb') { + $max_size_bytes = $max_val * 1024 * 1024; + } elsif (lc($max_unit) eq 'gb') { + $max_size_bytes = $max_val * 1024 * 1024 * 1024; + } else { + # Default to kB if unit is not recognized + $max_size_bytes = $max_val * 1024; + } + + # Fill more pages than minimally required to increase the chances of pages + # from the test table filling the buffer cache. 
+ $max_size_bytes = $max_size_bytes; + my $pages_needed = int($max_size_bytes / $block_size) + 10; # Add some extra to ensure buffers are filled + my $rows_to_insert = $pages_needed * 100; # Assuming roughly 100 rows per page for our table structure + + return ($max_size_bytes, $pages_needed, $rows_to_insert); +} + +# Function to calculate expected buffer count from size string +sub calculate_buffer_count +{ + my ($size_string, $block_size) = @_; + + # Parse size and convert to bytes + my ($size_val, $unit) = ($size_string =~ /(\d+)(\w+)/); + my $size_bytes; + if (lc($unit) eq 'kb') { + $size_bytes = $size_val * 1024; + } elsif (lc($unit) eq 'mb') { + $size_bytes = $size_val * 1024 * 1024; + } elsif (lc($unit) eq 'gb') { + $size_bytes = $size_val * 1024 * 1024 * 1024; + } else { + # Default to kB if unit is not recognized + $size_bytes = $size_val * 1024; + } + + return int($size_bytes / $block_size); +} + +# Initialize cluster with very small buffer sizes for testing +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; + +# Configure for buffer resizing with very small buffer pool sizes for faster tests. +# TODO: for some reason parallel workers try to load default number of shared_buffers which doesn't work with lower max_shared_buffers. We need to fix that - somewhere it's picking default value of shared buffers. 
For now disable parallelism +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', qq{ +max_shared_buffers = 512kB +shared_buffers = 320kB +max_parallel_workers_per_gather = 0 +}); + +$node->start; + +# Enable injection points +$node->safe_psql('postgres', "CREATE EXTENSION injection_points"); + +# Get the block size (this is fixed for the binary) +my $block_size = $node->safe_psql('postgres', "SHOW block_size"); + +# Try to create pg_buffercache extension for buffer analysis +eval { + $node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache"); +}; +if ($@) { + $node->stop; + plan skip_all => 'pg_buffercache extension not available - cannot verify buffer usage'; +} + +# Create a small test table, and fetch its properties for later reference if required. +$node->safe_psql('postgres', qq{ + CREATE TABLE client_test (c1 int, data char(50)); +}); + +my $table_oid = $node->safe_psql('postgres', "SELECT oid FROM pg_class WHERE relname = 'client_test'"); +my $table_relfilenode = $node->safe_psql('postgres', "SELECT relfilenode FROM pg_class WHERE relname = 'client_test'"); +note("Test table client_test: OID = $table_oid, relfilenode = $table_relfilenode"); +my ($max_size_bytes, $pages_needed, $rows_to_insert) = calculate_test_sizes($node, $block_size); + +# Create dedicated sessions for injection point handling and test queries, +# so that we don't create new backends for test operations after starting +# resize operation. Only one backend, which tests new backend synchronization +# with resizing operation, should start after resizing has commenced. 
+my $injection_session = $node->background_psql('postgres'); +my $query_session = $node->background_psql('postgres'); +my $resize_session = $node->background_psql('postgres'); + +# Function to run a single injection point test +sub run_injection_point_test +{ + my ($test_name, $injection_point, $target_size, $operation_type) = @_; + + note("Test with $test_name ($operation_type)"); + + # Calculate test parameters before starting resize + my ($max_size_bytes, $pages_needed, $rows_to_insert) = calculate_test_sizes($node, $target_size, $block_size); + + # Update buffer pool size and wait for it to reflect pending state + $resize_session->query_safe("ALTER SYSTEM SET shared_buffers = '$target_size'"); + $resize_session->query_safe("SELECT pg_reload_conf()"); + my $pending_size_str = "pending: $target_size"; + $resize_session->poll_query_until("SELECT substring(current_setting('shared_buffers'), '$pending_size_str')", $pending_size_str); + + # Set up injection point in injection session + $injection_session->query_safe("SELECT injection_points_attach('$injection_point', 'wait')"); + + # Trigger resize + $resize_session->query_until( + qr/starting_resize/, + q( + \echo starting_resize + SELECT pg_resize_shared_buffers(); + ) + ); + + # Wait until resize actually reaches the injection point using the query session + $query_session->wait_for_event('client backend', $injection_point); + + # Start a client while resize is paused + my $client = $node->background_psql('postgres'); + note("Background client backend PID: " . 
$client->query_safe("SELECT pg_backend_pid()")); + + # Wake up the injection point from injection session + $injection_session->query_safe("SELECT injection_points_wakeup('$injection_point')"); + + # Test buffer functionality immediately after waking up injection point + # Insert data to test buffer pool functionality during/after resize + $client->query_safe("INSERT INTO client_test SELECT i, 'test_data_' || i FROM generate_series(1, $rows_to_insert) i"); + # Verify the data was inserted correctly and can be read back + is($client->query_safe("SELECT COUNT(*) FROM client_test"), $rows_to_insert, "inserted $rows_to_insert during $test_name ($operation_type) successful"); + + # Verify table size is reasonable (should be substantial for testing) + ok($query_session->query_safe("SELECT pg_total_relation_size('client_test')") >= $max_size_bytes,"table size is large enough to overflow buffer pool in test $test_name ($operation_type)"); + + # Wait for the resize operation to complete. There is no direct way to do so + # in background_psql. 
Hence fire a psql command and wait for it to finish + $resize_session->query(q(\echo 'done')); + + # Detach injection point from injection session + $injection_session->query_safe("SELECT injection_points_detach('$injection_point')"); + + # Verify resize completed successfully + is($query_session->query_safe("SELECT current_setting('shared_buffers')"), $target_size, + "resize completed successfully to $target_size"); + + # Check buffer pool size using pg_buffercache after resize completion + is($query_session->query_safe("SELECT COUNT(*) FROM pg_buffercache"), calculate_buffer_count($target_size, $block_size), "all buffers in the buffer pool used in $test_name ($operation_type)"); + + # Wait for client to complete + ok($client->quit, "client succeeded during $test_name ($operation_type)"); + + # Clean up for next test + $query_session->query_safe("DELETE FROM client_test"); +} + +# Test injection points during buffer resize with client connections +my @common_injection_tests = ( + { + name => 'flag setting phase', + injection_point => 'pg-resize-shared-buffers-flag-set', + }, + { + name => 'memory remap phase', + injection_point => 'pgrsb-after-shmem-resize', + }, + { + name => 'resize map barrier complete', + injection_point => 'pgrsb-resize-barrier-sent', + }, +); + +# Test common injection points for both shrinking and expanding +foreach my $test (@common_injection_tests) +{ + # Test shrinking scenario + run_injection_point_test($test->{name}, $test->{injection_point}, '272kB', 'shrinking'); + + # Test expanding scenario + run_injection_point_test($test->{name}, $test->{injection_point}, '400kB', 'expanding'); +} + +my @shrink_only_tests = ( + { + name => 'shrink barrier complete', + injection_point => 'pgrsb-shrink-barrier-sent', + size => '200kB', + } +); +foreach my $test (@shrink_only_tests) +{ + run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'shrinking only'); +} + +my @expand_only_tests = ( + { + name => 'expand barrier 
complete', + injection_point => 'pgrsb-expand-barrier-sent', + size => '416kB', + } +); +foreach my $test (@expand_only_tests) +{ + run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'expanding only'); +} + +$injection_session->quit; +$query_session->quit; +$resize_session->quit; + +done_testing(); \ No newline at end of file diff --git a/src/test/meson.build b/src/test/meson.build index ccc31d6a86a1..2a5ba1dec398 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -4,6 +4,7 @@ subdir('regress') subdir('isolation') subdir('authentication') +subdir('buffermgr') subdir('postmaster') subdir('recovery') subdir('subscription') diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm index 60bbd5dd445b..16625e94d92e 100644 --- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm +++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm @@ -61,6 +61,7 @@ use Config; use IPC::Run; use PostgreSQL::Test::Utils qw(pump_until); use Test::More; +use Time::HiRes qw(usleep); =pod @@ -371,4 +372,79 @@ sub set_query_timer_restart return $self->{query_timer_restart}; } +=pod + +=item $session->poll_query_until($query [, $expected ]) + +Run B<$query> repeatedly in this background session, until it returns the +B<$expected> result ('t', or SQL boolean true, by default). +Continues polling if the query returns an error result. +Times out after a reasonable number of attempts. +Returns 1 if successful, 0 if timed out. 
+ +=cut + +sub poll_query_until +{ + my ($self, $query, $expected) = @_; + + $expected = 't' unless defined($expected); # default value + + my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; + my $attempts = 0; + my ($stdout, $stderr_flag); + + while ($attempts < $max_attempts) + { + ($stdout, $stderr_flag) = $self->query($query); + + chomp($stdout); + + # If query succeeded and returned expected result + if (!$stderr_flag && $stdout eq $expected) + { + return 1; + } + + # Wait 0.1 second before retrying. + usleep(100_000); + + $attempts++; + } + + # Give up. Print the output from the last attempt, hopefully that's useful + # for debugging. + my $stderr_output = $stderr_flag ? $self->{stderr} : ''; + diag qq(poll_query_until timed out executing this query: +$query +expecting this output: +$expected +last actual query output: +$stdout +with stderr: +$stderr_output); + return 0; +} + +=item $session->wait_for_event(backend_type, wait_event_name) + +Poll pg_stat_activity until backend_type reaches wait_event_name using this +background session. 
+ +=cut + +sub wait_for_event +{ + my ($self, $backend_type, $wait_event_name) = @_; + + $self->poll_query_until(qq[ + SELECT count(*) > 0 FROM pg_stat_activity + WHERE backend_type = '$backend_type' AND wait_event = '$wait_event_name' + ]) + or die + qq(timed out when waiting for $backend_type to reach wait event '$wait_event_name'); + + return; +} + 1; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 27a4d1318978..4d9879ac60d7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2778,6 +2778,7 @@ ShellTypeInfo ShippableCacheEntry ShippableCacheKey ShmemIndexEnt +ShmemControl ShutdownForeignScan_function ShutdownInformation ShutdownMode From 18d355bbea3ef267c9e35c097074b8ccbf5a6b96 Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Wed, 1 Oct 2025 09:38:19 +0530 Subject: [PATCH 4/4] WIP test shared buffers resizing and checkpoint A new test triggers an injection point in BufferSync() after it has collected the buffers to be flushed. Simultaneously it starts buffer shrinking. The expectation is that the checkpointer would crash accessing a buffer (descriptor) outside the new range of shared buffers. But that does not happen because of a bug in synchronization. The checkpointer does not reload configuration while a checkpoint is in progress, so it does not pick up the new value of the configuration. When the resizing is triggered by the PM, the checkpointer receives the proc signal barrier but does not process it: it doesn't enter the barrier mechanism and doesn't alter its address maps or memory sizes. Hence the test does not crash. But of course it means that it won't consider the correct size of buffers next time it performs a checkpoint. The test was at least useful to detect this anomaly. Once we fix the synchronization issue we should see the crash and then fix the crash. Author: Ashutosh Bapat Notes to reviewers ------------------ 1.
pg_buffercache used a query on pg_settings to fetch the value of the number of buffers. That doesn't work anymore because of change in the SHOW shared_buffers. Modified the test to convert the setting value to the number of shared buffers, save it in a variable and use the variable in queries which need the number of shared buffers. We could instead fix ShowGUCOption() to pass use_units flag to show_hook and let it output the number of shared buffers instead. But that seems a larger change. There aren't other GUCs whose show_hook outputs their values with units. So this local fix might be better. --- .../expected/pg_buffercache.out | 19 ++- contrib/pg_buffercache/sql/pg_buffercache.sql | 19 ++- src/backend/storage/buffer/bufmgr.c | 4 + src/test/buffermgr/meson.build | 1 + .../t/002_checkpoint_buffer_resize.pl | 111 ++++++++++++++++++ 5 files changed, 130 insertions(+), 24 deletions(-) create mode 100644 src/test/buffermgr/t/002_checkpoint_buffer_resize.pl diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 2f27bf34637e..632b12abbf84 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -1,8 +1,9 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache; +select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers + from pg_settings + where name = 'shared_buffers' +\gset +select count(*) = :nbuffers from pg_buffercache; ?column? 
---------- t @@ -24,20 +25,14 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; (1 row) -- Test the buffer lookup table function and count is <= shared_buffers -select count(*) <= (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache_lookup_table_entries(); +select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries(); ?column? ---------- t (1 row) -- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers -select count(*) <= (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache_lookup_table; +select count(*) <= :nbuffers from pg_buffercache_lookup_table; ?column? ---------- t diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 569b28aebb9d..11fe85ceb3bb 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -1,9 +1,10 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache; +select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers + from pg_settings + where name = 'shared_buffers' +\gset +select count(*) = :nbuffers from pg_buffercache; select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, @@ -13,16 +14,10 @@ from pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; -- Test the buffer lookup table function and count is <= shared_buffers -select count(*) <= (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache_lookup_table_entries(); +select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries(); -- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers -select count(*) <= (select setting::bigint - from 
pg_settings - where name = 'shared_buffers') -from pg_buffercache_lookup_table; +select count(*) <= :nbuffers from pg_buffercache_lookup_table; -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6c8f8552a4cc..f489ae2932fc 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -67,6 +67,7 @@ #include "utils/rel.h" #include "utils/resowner.h" #include "utils/timestamp.h" +#include "utils/injection_point.h" /* Note: these two macros only work on shared buffers, not local ones! */ @@ -3416,6 +3417,9 @@ BufferSync(int flags) ProcessProcSignalBarrier(); } + /* Injection point after scanning all buffers for dirty pages */ + INJECTION_POINT("buffer-sync-dirty-buffer-scan", NULL); + if (num_to_scan == 0) return; /* nothing to do */ diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build index c24bff721e60..f33feb64a069 100644 --- a/src/test/buffermgr/meson.build +++ b/src/test/buffermgr/meson.build @@ -16,6 +16,7 @@ tests += { }, 'tests': [ 't/001_resize_buffer.pl', + 't/002_checkpoint_buffer_resize.pl', 't/003_parallel_resize_buffer.pl', 't/004_client_join_buffer_resize.pl', ], diff --git a/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl new file mode 100644 index 000000000000..9ab615b6557f --- /dev/null +++ b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl @@ -0,0 +1,111 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test shared_buffer resizing coordination with checkpoint using injection points + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan 
skip_all => 'Injection points not supported by this build'; +} + +# Initialize cluster with injection points enabled +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', 'shared_buffers = 256kB'); +# Disable background writer to prevent interference with dirty buffers +$node->append_conf('postgresql.conf', 'bgwriter_lru_maxpages = 0'); +$node->start; + +# Load the injection points extension +$node->safe_psql('postgres', "CREATE EXTENSION injection_points"); + +# Create some data to make checkpoint meaningful and ensure many dirty buffers +$node->safe_psql('postgres', "CREATE TABLE test_data (id int, data text)"); + +# Insert enough data to fill more than 16 buffers (each row is ~1KB, so only a handful of rows fit on an 8kB page) +$node->safe_psql('postgres', "INSERT INTO test_data SELECT i, repeat('x', 1000) FROM generate_series(1, 5000) i"); + +# Create additional tables to ensure we have plenty of dirty buffers +$node->safe_psql('postgres', "CREATE TABLE test_data2 AS SELECT * FROM test_data WHERE id <= 2500"); +$node->safe_psql('postgres', "CREATE TABLE test_data3 AS SELECT * FROM test_data WHERE id > 2500"); + +# Update data to create more dirty buffers +$node->safe_psql('postgres', "UPDATE test_data SET data = repeat('y', 1000) WHERE id % 3 = 0"); +$node->safe_psql('postgres', "UPDATE test_data2 SET data = repeat('z', 1000) WHERE id % 2 = 0"); + +# Prepare the new shared_buffers configuration before starting checkpoint +$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '128kB'"); +$node->safe_psql('postgres', "SELECT pg_reload_conf()"); + +# Set up the injection point to make checkpoint wait +$node->safe_psql('postgres', "SELECT injection_points_attach('buffer-sync-dirty-buffer-scan', 'wait')"); + +# Start a checkpoint in the background that will trigger the injection point +my $checkpoint_session = $node->background_psql('postgres');
+$checkpoint_session->query_until( + qr/starting_checkpoint/, + q( + \echo starting_checkpoint + CHECKPOINT; + \q + ) +); + +# Wait until checkpointer actually reaches the injection point +$node->wait_for_event('checkpointer', 'buffer-sync-dirty-buffer-scan'); + +# Verify checkpoint is waiting by checking if it hasn't completed +my $checkpoint_running = $node->safe_psql('postgres', + "SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'checkpointer' AND wait_event = 'buffer-sync-dirty-buffer-scan'"); +is($checkpoint_running, '1', 'Checkpoint is waiting at injection point'); + +# Start the resize operation in the background (don't wait for completion) +my $resize_session = $node->background_psql('postgres'); +$resize_session->query_until( + qr/starting_resize/, + q( + \echo starting_resize + SELECT pg_resize_shared_buffers(); + ) +); + +# Continue the checkpoint and wait for its completion +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', "SELECT injection_points_wakeup('buffer-sync-dirty-buffer-scan')"); + +# Wait for both checkpoint and resize to complete +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +# Wait for the resize operation to complete using the proper method +$resize_session->query(q(\echo 'resize_complete')); + +pass('Checkpoint and buffer resize both completed after injection point was released'); + +# Verify the resize actually worked +is($node->safe_psql('postgres', "SHOW shared_buffers"), '128kB', + 'Buffer resize completed successfully after checkpoint coordination'); + +# Cleanup the background session +$resize_session->quit; + +# Clean up the injection point +$node->safe_psql('postgres', "SELECT injection_points_detach('buffer-sync-dirty-buffer-scan')"); + +# Verify system remains stable after coordinated operations + +# Perform a normal checkpoint to ensure everything is working +$node->safe_psql('postgres', "CHECKPOINT"); + +pass('System remains stable after injection point testing'); + +# Cleanup 
+$node->safe_psql('postgres', "DROP TABLE test_data, test_data2, test_data3"); + +done_testing(); \ No newline at end of file