diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1b..632b12abbf84 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -1,8 +1,9 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache; +select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers + from pg_settings + where name = 'shared_buffers' +\gset +select count(*) = :nbuffers from pg_buffercache; ?column? ---------- t @@ -23,6 +24,20 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; t (1 row) +-- Test the buffer lookup table function and count is <= shared_buffers +select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries(); + ?column? +---------- + t +(1 row) + +-- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers +select count(*) <= :nbuffers from pg_buffercache_lookup_table; + ?column? +---------- + t +(1 row) + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. 
SET ROLE pg_database_owner; @@ -34,6 +49,10 @@ SELECT * FROM pg_buffercache_summary(); ERROR: permission denied for function pg_buffercache_summary SELECT * FROM pg_buffercache_usage_counts(); ERROR: permission denied for function pg_buffercache_usage_counts +SELECT * FROM pg_buffercache_lookup_table_entries(); +ERROR: permission denied for function pg_buffercache_lookup_table_entries +SELECT * FROM pg_buffercache_lookup_table; +ERROR: permission denied for view pg_buffercache_lookup_table RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; @@ -55,6 +74,21 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); t (1 row) +RESET role; +-- Check that pg_read_all_stats is allowed to query buffer lookup table +SET ROLE pg_read_all_stats; +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries(); + ?column? +---------- + t +(1 row) + +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table; + ?column? +---------- + t +(1 row) + RESET role; ------ ---- Test pg_buffercache_evict* functions diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql index 458f054a6917..9bf58567878d 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql @@ -44,3 +44,27 @@ CREATE FUNCTION pg_buffercache_evict_all( OUT buffers_skipped int4) AS 'MODULE_PATHNAME', 'pg_buffercache_evict_all' LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Add the buffer lookup table function +CREATE FUNCTION pg_buffercache_lookup_table_entries( + OUT tablespace oid, + OUT database oid, + OUT relfilenode oid, + OUT forknum int2, + OUT blocknum int8, + OUT bufferid int4) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_buffercache_lookup_table_entries' +LANGUAGE C PARALLEL SAFE VOLATILE; + +-- Create a view for convenient access. 
+CREATE VIEW pg_buffercache_lookup_table AS + SELECT * FROM pg_buffercache_lookup_table_entries(); + +-- Don't want these to be available to public. +REVOKE ALL ON FUNCTION pg_buffercache_lookup_table_entries() FROM PUBLIC; +REVOKE ALL ON pg_buffercache_lookup_table FROM PUBLIC; + +-- Grant access to monitoring role. +GRANT EXECUTE ON FUNCTION pg_buffercache_lookup_table_entries() TO pg_read_all_stats; +GRANT SELECT ON pg_buffercache_lookup_table TO pg_read_all_stats; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index c29b784dfa1a..45efc6a314bf 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -16,6 +16,7 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/rel.h" +#include "utils/tuplestore.h" #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 @@ -100,6 +101,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); PG_FUNCTION_INFO_V1(pg_buffercache_evict); PG_FUNCTION_INFO_V1(pg_buffercache_evict_relation); PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); +PG_FUNCTION_INFO_V1(pg_buffercache_lookup_table_entries); /* Only need to touch memory once per backend process lifetime */ @@ -116,6 +118,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); if (SRF_IS_FIRSTCALL()) { @@ -172,10 +175,10 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) /* Allocate NBuffers worth of BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); + sizeof(BufferCachePagesRec) * currentNBuffers); /* Set max calls and remember the user function context. 
*/ - funcctx->max_calls = NBuffers; + funcctx->max_calls = currentNBuffers; funcctx->user_fctx = fctx; /* Return to original context when allocating transient memory */ @@ -189,13 +192,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * snapshot across all buffers, but we do grab the buffer header * locks, so the information of each buffer is self-consistent. */ - for (i = 0; i < NBuffers; i++) + for (i = 0; i < currentNBuffers; i++) { BufferDesc *bufHdr; uint32 buf_state; CHECK_FOR_INTERRUPTS(); + /* + * TODO: We should just scan the entire buffer descriptor + * array instead of relying on curent buffer pool size. But that can + * happen if only we setup the descriptor array large enough at the + * server startup time. + */ + if (currentNBuffers != pg_atomic_read_u32(&ShmemCtrl->currentNBuffers)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("number of shared buffers changed during scan of buffer cache"))); + bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); @@ -776,3 +790,19 @@ pg_buffercache_evict_all(PG_FUNCTION_ARGS) PG_RETURN_DATUM(result); } + +/* + * Return lookup table content as a set of records. 
+ */ +Datum +pg_buffercache_lookup_table_entries(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + + InitMaterializedSRF(fcinfo, 0); + + /* Fill the tuplestore */ + BufTableGetContents(rsinfo->setResult, rsinfo->setDesc); + + return (Datum) 0; +} diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c74..11fe85ceb3bb 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -1,9 +1,10 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint - from pg_settings - where name = 'shared_buffers') -from pg_buffercache; +select pg_size_bytes(setting)/(select setting::bigint from pg_settings where name = 'block_size') AS nbuffers + from pg_settings + where name = 'shared_buffers' +\gset +select count(*) = :nbuffers from pg_buffercache; select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, @@ -12,6 +13,12 @@ from pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; +-- Test the buffer lookup table function and count is <= shared_buffers +select count(*) <= :nbuffers from pg_buffercache_lookup_table_entries(); + +-- Check that pg_buffercache_lookup_table view works and count is <= shared_buffers +select count(*) <= :nbuffers from pg_buffercache_lookup_table; + -- Check that the functions / views can't be accessed by default. To avoid -- having to create a dedicated user, use the pg_database_owner pseudo-role. 
SET ROLE pg_database_owner; @@ -19,6 +26,8 @@ SELECT * FROM pg_buffercache; SELECT * FROM pg_buffercache_pages() AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); +SELECT * FROM pg_buffercache_lookup_table_entries(); +SELECT * FROM pg_buffercache_lookup_table; RESET role; -- Check that pg_monitor is allowed to query view / function @@ -28,6 +37,12 @@ SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); RESET role; +-- Check that pg_read_all_stats is allowed to query buffer lookup table +SET ROLE pg_read_all_stats; +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table_entries(); +SELECT count(*) >= 0 FROM pg_buffercache_lookup_table; +RESET role; + ------ ---- Test pg_buffercache_evict* functions diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 023b3f03ba93..d007055eed79 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1724,7 +1724,6 @@ include_dir 'conf.d' that is BLCKSZ bytes, typically 8kB. (Non-default values of BLCKSZ change the minimum value.) - This parameter can only be set at server start. @@ -1747,6 +1746,49 @@ include_dir 'conf.d' appropriate, so as to leave adequate space for the operating system. + + The shared memory consumed by the buffer pool is allocated and + initialized according to the value of the GUC at the time of starting + the server. A desired new value of GUC can be loaded while the server is + running using SIGHUP. But the buffer pool will + not be resized immediately. Use + pg_resize_shared_buffers() to dynamically resize + the shared buffer pool (see for details). + SHOW shared_buffers shows the current number of + shared buffers and pending number, if any. Please note that when the GUC + is changed, the other GUCS which use this GUCs value to set their + defaults will not be changed. They may still require a server restart to + consider new value. 
+ + + + + + max_shared_buffers (integer) + + max_shared_buffers configuration parameter + + + + + Sets the upper limit for the shared_buffers value. + The default value is 0, + which means no explicit limit is set and max_shared_buffers + will be automatically set to the value of shared_buffers + at server startup. + If this value is specified without units, it is taken as blocks, + that is BLCKSZ bytes, typically 8kB. + This parameter can only be set at server start. + + + + This parameter determines the amount of memory address space to reserve + in each backend for expanding the buffer pool in future. While the + memory for buffer pool is allocated on demand as it is resized, the + memory required to hold the buffer manager metadata is allocated + statically at the server start accounting for the largest buffer pool + size allowed by this parameter. + diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml index 1b465bc8ba71..0dc89b07c765 100644 --- a/doc/src/sgml/func/func-admin.sgml +++ b/doc/src/sgml/func/func-admin.sgml @@ -99,6 +99,63 @@ off + + + + + pg_resize_shared_buffers + + pg_resize_shared_buffers () + boolean + + + Dynamically resizes the shared buffer pool to match the current + value of the shared_buffers parameter. This + function implements a coordinated resize process that ensures all + backend processes acknowledge the change before completing the + operation. The resize happens in multiple phases to maintain + data consistency and system stability. Returns true + if the resize was successful, or raises an error if the operation + fails. This function can only be called by superusers. + + + To resize shared buffers, first update the shared_buffers + setting and reload the configuration, then verify the new value is loaded + before calling this function. 
For example: + +postgres=# ALTER SYSTEM SET shared_buffers = '256MB'; +ALTER SYSTEM +postgres=# SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +------------------------- + 128MB (pending: 256MB) +(1 row) + +postgres=# SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +postgres=# SHOW shared_buffers; + shared_buffers +---------------- + 256MB +(1 row) + + The SHOW shared_buffers step is important to verify + that the configuration reload was successful and the new value is + available to the current session before attempting the resize. The + output shows both the current and pending values when a change is waiting + to be applied. + + diff --git a/doc/src/sgml/system-views.sgml b/doc/src/sgml/system-views.sgml index 0e623e7fb867..7cb1e5e17f81 100644 --- a/doc/src/sgml/system-views.sgml +++ b/doc/src/sgml/system-views.sgml @@ -71,6 +71,11 @@ backend memory contexts + + pg_buffer_lookup_table + shared buffer lookup table + + pg_config compile-time configuration parameters @@ -901,6 +906,90 @@ AND c1.path[c2.level] = c2.path[c2.level]; + + <structname>pg_buffer_lookup_table</structname> + + pg_buffer_lookup_table + + + The pg_buffer_lookup_table view exposes the current + contents of the shared buffer lookup table. Each row represents an entry in + the lookup table mapping a relation page to the ID of buffer in which it is + cached. The shared buffer lookup table is locked for a short duration while + reading so as to ensure consistency. This may affect performance if this view + is queried very frequently. 
+ + + <structname>pg_buffer_lookup_table</structname> View + + + + + Column Type + + + Description + + + + + + + tablespace oid + + + OID of the tablespace containing the relation + + + + + database oid + + + OID of the database containing the relation (zero for shared relations) + + + + + relfilenode oid + + + relfilenode identifying the relation + + + + + forknum int2 + + + Fork number within the relation (see ) + + + + + blocknum int8 + + + Block number within the relation + + + + + bufferid int4 + + + ID of the buffer caching the page + + + + +
+ + Access to this view is restricted to members of the + pg_read_all_stats role by default. + +
+ <structname>pg_config</structname> @@ -4144,6 +4233,15 @@ SELECT * FROM pg_locks pl LEFT JOIN pg_prepared_xacts ppx
+ + + segment text + + + The name of the shared memory segment concerning the allocation. + + + off int8 diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 77676d6d0359..73df59098866 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -232,7 +232,7 @@ SimpleLruAutotuneBuffers(int divisor, int max) { return Min(max - (max % SLRU_BANK_SIZE), Max(SLRU_BANK_SIZE, - NBuffers / divisor - (NBuffers / divisor) % SLRU_BANK_SIZE)); + NBuffersPending / divisor - (NBuffersPending / divisor) % SLRU_BANK_SIZE)); } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 22d0a2e8c3a6..f4363e0035d9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4676,7 +4676,7 @@ XLOGChooseNumBuffers(void) { int xbuffers; - xbuffers = NBuffers / 32; + xbuffers = NBuffersPending / 32; if (xbuffers > (wal_segment_size / XLOG_BLCKSZ)) xbuffers = (wal_segment_size / XLOG_BLCKSZ); if (xbuffers < 8) diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index fc8638c1b61b..226944e45882 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -335,6 +335,8 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) InitializeFastPathLocks(); + InitializeMaxNBuffers(); + CreateSharedMemoryAndSemaphores(); /* diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 95ad29a64b98..6a0180f39be4 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -668,6 +668,13 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats; +CREATE VIEW pg_shmem_segments AS + SELECT * FROM pg_get_shmem_segments(); + +REVOKE ALL ON pg_shmem_segments FROM PUBLIC; +GRANT SELECT ON pg_shmem_segments 
TO pg_read_all_stats; +REVOKE EXECUTE ON FUNCTION pg_get_shmem_segments() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_shmem_segments() TO pg_read_all_stats; CREATE VIEW pg_shmem_allocations_numa AS SELECT * FROM pg_get_shmem_allocations_numa(); diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..68de301441bb 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -30,13 +30,19 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" #include "portability/mem.h" +#include "storage/bufmgr.h" #include "storage/dsm.h" #include "storage/fd.h" #include "storage/ipc.h" +#include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" #include "utils/guc.h" #include "utils/guc_hooks.h" #include "utils/pidfile.h" +#include "utils/wait_event.h" /* @@ -90,12 +96,90 @@ typedef enum SHMSTATE_UNATTACHED, /* pertinent to DataDir, no attached PIDs */ } IpcMemoryState; - +/* + * TODO: These should be moved into ShmemSegment, now that there can be multiple + * shared memory segments. But there's windows specific code which will need + * adjustment, so leaving it here. + */ unsigned long UsedShmemSegID = 0; void *UsedShmemSegAddr = NULL; -static Size AnonymousShmemSize; -static void *AnonymousShmem = NULL; +volatile bool delay_shmem_resize = false; + +/* + * Anonymous mapping layout we use looks like this: + * + * 00400000-00c2a000 r-xp /bin/postgres + * ... + * 3f526000-3f590000 rw-p [heap] + * 7fbd827fe000-7fbd8bdde000 rw-s /memfd:main (deleted) + * 7fbd8bdde000-7fbe82800000 ---s /memfd:main (deleted) + * 7fbe82800000-7fbe90670000 r--p /usr/lib/locale/locale-archive + * 7fbe90800000-7fbe90941000 r-xp /usr/lib64/libstdc++.so.6.0.34 + * ... + * + * We need to place shared memory mappings in such a way, that there will be + * gaps between them in the address space. 
Those gaps have to be large enough + * to resize the mapping up to certain size, without counting towards the total + * memory consumption. + * + * To achieve this, for each shared memory segment we first create an anonymous + * file of specified size using memfd_create, which will accomodate actual + * shared memory mapping content. It is represented by the first /memfd:main + * with rw permissions. Then we create a mapping for this file using mmap, with + * size much larger than required and flags PROT_NONE (allows to make sure the + * reserved space will not be used) and MAP_NORESERVE (prevents the space from + * being counted against memory limits). The mapping serves as an address space + * reservation, into which shared memory segment can be extended and is + * represented by the second /memfd:main with no permissions. + * + * The reserved space for buffer manager related segments is calculated based on + * MaxNBuffers. + */ + +/* + * Flag telling that we have decided to use huge pages. + * + * XXX: It's possible to use GetConfigOption("huge_pages_status", false, false) + * instead, but it feels like an overkill. + */ +static bool huge_pages_on = false; + +/* + * Currently broadcasted value of NBuffers in shared memory. + * + * Most of the time this value is going to be equal to NBuffers. But if + * postmaster is resizing shared memory and a new backend was created + * at the same time, there is a possibility for the new backend to inherit the + * old NBuffers value, but miss the resize signal if ProcSignal infrastructure + * was not initialized yet. Consider this situation: + * + * Postmaster ------> New Backend + * | | + * | Launch + * | | + * | Inherit NBuffers + * | | + * Resize NBuffers | + * | | + * Emit Barrier | + * | Init ProcSignal + * | | + * Finish resize | + * | | + * New NBuffers Old NBuffers + * + * In this case the backend is not yet ready to receive a signal from + * EmitProcSignalBarrier, and will be ignored. 
The same happens if ProcSignal + * is initialized even later, after the resizing was finished. + * + * To address resulting inconsistency, postmaster broadcasts the current + * NBuffers value via shared memory. Every new backend has to verify this value + * before it will access the buffer pool: if it differs from its own value, + * this indicates a shared memory resize has happened and the backend has to + * first synchronize with rest of the pack. + */ +ShmemControl *ShmemCtrl = NULL; static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size); static void IpcMemoryDetach(int status, Datum shmaddr); @@ -104,6 +188,25 @@ static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, void *attachAt, PGShmemHeader **addr); +const char* +MappingName(int shmem_segment) +{ + switch (shmem_segment) + { + case MAIN_SHMEM_SEGMENT: + return "main"; + case BUFFERS_SHMEM_SEGMENT: + return "buffers"; + case BUFFER_DESCRIPTORS_SHMEM_SEGMENT: + return "descriptors"; + case BUFFER_IOCV_SHMEM_SEGMENT: + return "iocv"; + case CHECKPOINT_BUFFERS_SHMEM_SEGMENT: + return "checkpoint"; + default: + return "unknown"; + } +} /* * InternalIpcMemoryCreate(memKey, size) @@ -470,19 +573,20 @@ PGSharedMemoryAttach(IpcMemoryId shmId, * hugepage sizes, we might want to think about more invasive strategies, * such as increasing shared_buffers to absorb the extra space. * - * Returns the (real, assumed or config provided) page size into - * *hugepagesize, and the hugepage-related mmap flags to use into - * *mmap_flags if requested by the caller. If huge pages are not supported, - * *hugepagesize and *mmap_flags are set to 0. + * Returns the (real, assumed or config provided) page size into *hugepagesize, + * the hugepage-related mmap and memfd flags to use into *mmap_flags and + * *memfd_flags if requested by the caller. If huge pages are not supported, + * *hugepagesize, *mmap_flags and *memfd_flags are set to 0. 
*/ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { #ifdef MAP_HUGETLB Size default_hugepagesize = 0; Size hugepagesize_local = 0; int mmap_flags_local = 0; + int memfd_flags_local = 0; /* * System-dependent code to find out the default huge page size. @@ -541,6 +645,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } mmap_flags_local = MAP_HUGETLB; + memfd_flags_local = MFD_HUGETLB; /* * On recent enough Linux, also include the explicit page size, if @@ -551,7 +656,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { int shift = pg_ceil_log2_64(hugepagesize_local); - mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif + +#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT) + if (hugepagesize_local != default_hugepagesize) + { + int shift = pg_ceil_log2_64(hugepagesize_local); + + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; } #endif @@ -560,6 +674,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *mmap_flags = mmap_flags_local; if (hugepagesize) *hugepagesize = hugepagesize_local; + if (memfd_flags) + *memfd_flags = memfd_flags_local; #else @@ -567,6 +683,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *hugepagesize = 0; if (mmap_flags) *mmap_flags = 0; + if (memfd_flags) + *memfd_flags = 0; #endif /* MAP_HUGETLB */ } @@ -588,83 +706,242 @@ check_huge_page_size(int *newval, void **extra, GucSource source) return true; } +/* + * Wrapper around posix_fallocate() to allocate memory for a given shared memory + * segment. + * + * Performs retry on EINTR, and raises error upon failure. 
+ */ +static void +shmem_fallocate(int fd, const char *mapping_name, Size size, int elevel) +{ +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + int ret; + + + /* + * If there is not enough memory, trying to access a hole in address space + * will cause SIGBUS. If supported, avoid that by allocating memory upfront. + * + * We still use a traditional EINTR retry loop to handle SIGCONT. + * posix_fallocate() doesn't restart automatically, and we don't want this to + * fail if you attach a debugger. + */ + do + { + ret = posix_fallocate(fd, 0, size); + } while (ret == EINTR); + + if (ret != 0) + { + ereport(elevel, + (errmsg("segment[%s]: could not allocate space for anonymous file: %s", + mapping_name, strerror(ret)), + (ret == ENOMEM) ? + errhint("This error usually means that PostgreSQL's request " + "for a shared memory segment exceeded available memory, " + "swap space, or huge pages. To reduce the request size " + "(currently %zu bytes), reduce PostgreSQL's shared " + "memory usage, perhaps by reducing \"shared_buffers\" or " + "\"max_connections\".", + size) : 0)); + } +#endif /* HAVE_POSIX_FALLOCATE && __linux__ */ +} + +/* + * Round up the required amount of memory and the amount of required reserved + * address space to the nearest huge page size. + */ +static inline void +round_off_mapping_sizes_for_hugepages(MemoryMappingSizes *mapping, int hugepagesize) +{ + if (hugepagesize == 0) + return; + + if (mapping->shmem_req_size % hugepagesize != 0) + mapping->shmem_req_size += hugepagesize - + (mapping->shmem_req_size % hugepagesize); + + if (mapping->shmem_reserved % hugepagesize != 0) + mapping->shmem_reserved = mapping->shmem_reserved + hugepagesize - + (mapping->shmem_reserved % hugepagesize); +} + /* * Creates an anonymous mmap()ed shared memory segment. * - * Pass the requested size in *size. This function will modify *size to the - * actual size of the allocation, if it ends up allocating a segment that is - * larger than requested. 
+ * This function will modify mapping size to the actual size of the allocation, + * if it ends up allocating a segment that is larger than requested. If needed, + * it also rounds up the mapping reserved size to be a multiple of huge page + * size. + * + * Note that we do not fallback from huge pages to regular pages in this + * function, this decision was already made in ReserveAnonymousMemory and we + * stick to it. + * + * TODO: Update the prologue to be consistent with the code. */ -static void * -CreateAnonymousSegment(Size *size) +static void +CreateAnonymousSegment(MemoryMappingSizes *mapping, int segment_id) { - Size allocsize = *size; void *ptr = MAP_FAILED; - int mmap_errno = 0; + int save_errno = 0; + int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0; + ShmemSegment *segment = &Segments[segment_id]; #ifndef MAP_HUGETLB - /* PGSharedMemoryCreate should have dealt with this case */ - Assert(huge_pages != HUGE_PAGES_ON); + /* PrepareHugePages should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); #else - if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + if (huge_pages_on) { - /* - * Round up the request size to a suitable large value. 
- */ Size hugepagesize; - int mmap_flags; - GetHugePageSize(&hugepagesize, &mmap_flags); + /* Make sure nothing is messed up */ + Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY); - if (allocsize % hugepagesize != 0) - allocsize += hugepagesize - (allocsize % hugepagesize); + /* Round up the request size to a suitable large value */ + GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags); + round_off_mapping_sizes_for_hugepages(mapping, hugepagesize); - ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS | mmap_flags, -1, 0); - mmap_errno = errno; - if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED) - elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m", - allocsize); + /* Verify that the new size is withing the reserved boundaries */ + Assert(mapping->shmem_reserved >= mapping->shmem_req_size); + + mmap_flags = PG_MMAP_FLAGS | mmap_flags; } #endif /* - * Report whether huge pages are in use. This needs to be tracked before - * the second mmap() call if attempting to use huge pages failed - * previously. + * Prepare an anonymous file backing the segment. Its size will be + * specified later via ftruncate. + * + * The file behaves like a regular file, but lives in memory. Once all + * references to the file are dropped, it is automatically released. + * Anonymous memory is used for all backing pages of the file, thus it has + * the same semantics as anonymous memory allocations using mmap with the + * MAP_ANONYMOUS flag. */ - SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? 
"off" : "on", - PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + segment->segment_fd = memfd_create(MappingName(segment_id), memfd_flags); + if (segment->segment_fd == -1) + ereport(FATAL, + (errmsg("segment[%s]: could not create anonymous shared memory file: %m", + MappingName(segment_id)))); - if (ptr == MAP_FAILED && huge_pages != HUGE_PAGES_ON) - { - /* - * Use the original size, not the rounded-up value, when falling back - * to non-huge pages. - */ - allocsize = *size; - ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); - mmap_errno = errno; - } + elog(DEBUG1, "segment[%s]: mmap(%zu)", MappingName(segment_id), mapping->shmem_req_size); + /* + * Reserve maximum required address space for future expansion of this + * memory segment. MAP_NORESERVE ensures that no memory is allocated. The + * whole address space will be setup for read/write access, so that memory + * allocated to this address space can be read or written to even if it is + * resized. + */ + ptr = mmap(NULL, mapping->shmem_reserved, PROT_READ | PROT_WRITE, + mmap_flags | MAP_NORESERVE, segment->segment_fd, 0); if (ptr == MAP_FAILED) + ereport(FATAL, + (errmsg("segment[%s]: could not map anonymous shared memory: %m", + MappingName(segment_id)))); + + /* + * Resize the backing file to the required size. On platforms where it is + * supported, we also allocate the required memory upfront. On other + * platform the memory upto the size of file will be allocated on demand. + */ + if(ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1) { - errno = mmap_errno; + save_errno = errno; + + close(segment->segment_fd); + + errno = save_errno; ereport(FATAL, - (errmsg("could not map anonymous shared memory: %m"), - (mmap_errno == ENOMEM) ? + (errmsg("segment[%s]: could not truncate anonymous file to size %zu: %m", + MappingName(segment_id), mapping->shmem_req_size), + (save_errno == ENOMEM) ? 
errhint("This error usually means that PostgreSQL's request " "for a shared memory segment exceeded available memory, " "swap space, or huge pages. To reduce the request size " "(currently %zu bytes), reduce PostgreSQL's shared " "memory usage, perhaps by reducing \"shared_buffers\" or " "\"max_connections\".", - allocsize) : 0)); + mapping->shmem_req_size) : 0)); + } + shmem_fallocate(segment->segment_fd, MappingName(segment_id), mapping->shmem_req_size, FATAL); + + segment->shmem = ptr; + segment->shmem_size = mapping->shmem_req_size; + segment->shmem_reserved = mapping->shmem_reserved; +} + +/* + * PrepareHugePages + * + * Figure out if there are enough huge pages to allocate all shared memory + * segments, and report that information via huge_pages_status and + * huge_pages_on. It needs to be called before creating shared memory segments. + * + * It is necessary to maintain the same semantic (simple on/off) for + * huge_pages_status, even if there are multiple shared memory segments: all + * segments either use huge pages or not, there is no mix of segments with + * different page size. The latter might be actually beneficial, in particular + * because only some segments may require large amount of memory, but for now + * we go with a simple solution. 
+ */ +void +PrepareHugePages() +{ + void *ptr = MAP_FAILED; + MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS]; + + CalculateShmemSize(mapping_sizes); + + /* Complain if hugepages demanded but we can't possibly support them */ +#if !defined(MAP_HUGETLB) + if (huge_pages == HUGE_PAGES_ON) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("huge pages not supported on this platform"))); +#else + if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY) + { + Size hugepagesize, total_size = 0; + int mmap_flags; + + GetHugePageSize(&hugepagesize, &mmap_flags, NULL); + + /* + * Figure out how much memory is needed for all segments, keeping in + * mind that for every segment this value will be rounding up by the + * huge page size. The resulting value will be used to probe memory and + * decide whether we will allocate huge pages or not. + */ + for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++) + { + Size segment_size = mapping_sizes[segment].shmem_req_size; + + if (segment_size % hugepagesize != 0) + segment_size += hugepagesize - (segment_size % hugepagesize); + + total_size += segment_size; + } + + /* Map total amount of memory to test its availability. */ + elog(DEBUG1, "reserving space: probe mmap(%zu) with MAP_HUGETLB", + total_size); + ptr = mmap(NULL, total_size, PROT_NONE, + PG_MMAP_FLAGS | MAP_ANONYMOUS | mmap_flags, -1, 0); } +#endif - *size = allocsize; - return ptr; + /* + * Report whether huge pages are in use. This needs to be tracked before + * creating shared memory segments. + */ + SetConfigOption("huge_pages_status", (ptr == MAP_FAILED) ? "off" : "on", + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + huge_pages_on = ptr != MAP_FAILED; } /* @@ -674,20 +951,133 @@ CreateAnonymousSegment(Size *size) static void AnonymousShmemDetach(int status, Datum arg) { - /* Release anonymous shared memory block, if any. 
*/ - if (AnonymousShmem != NULL) + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) { - if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) - elog(LOG, "munmap(%p, %zu) failed: %m", - AnonymousShmem, AnonymousShmemSize); - AnonymousShmem = NULL; + ShmemSegment *segment = &Segments[i]; + + /* Release anonymous shared memory block, if any. */ + if (segment->shmem != NULL) + { + if (munmap(segment->shmem, segment->shmem_size) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + segment->shmem, segment->shmem_size); + segment->shmem = NULL; + } } } +/* + * Resize all shared memory segments based on the new shared_buffers value (saved + * in ShmemCtrl area). The actual segment resizing is done via ftruncate, which + * will fail if there is not sufficient space to expand the anon file. + * + * TODO: Rename this to BufferShmemResize() or something. Only buffer manager's + * memory should be resized in this function. + * + * TODO: This function changes the amount of shared memory used. So it should + * also update the show only GUCs shared_memory_size and + * shared_memory_size_in_huge_pages in all backends. SetConfigOption() may be + * used for that. But it's not clear whether is_reload parameter is safe to use + * while resizing is going on; also at what stage it should be done. + */ +bool +AnonymousShmemResize(void) +{ + int mmap_flags = PG_MMAP_FLAGS; + Size hugepagesize; + MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS]; + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* TODO: This is a hack. NBuffersPending should never be written by anything + * other than GUC system. Find a way to pass new NBuffers value to + * BufferManagerShmemSize(). 
*/ + NBuffersPending = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + elog(DEBUG1, "Resize shmem from %d to %d", NBuffers, NBuffersPending); + +#ifndef MAP_HUGETLB + /* PrepareHugePages should have dealt with this case */ + Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); +#else + if (huge_pages_on) + { + Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY); + GetHugePageSize(&hugepagesize, &mmap_flags, NULL); + } +#endif + + /* Note that BufferManagerShmemSize() indirectly depends on NBuffersPending. */ + BufferManagerShmemSize(mapping_sizes); + + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + MemoryMappingSizes *mapping = &mapping_sizes[i]; + ShmemSegment *segment = &Segments[i]; + PGShmemHeader *shmem_hdr = segment->ShmemSegHdr; + + /* Main shared memory segment is always static. Ignore it. */ + if (i == MAIN_SHMEM_SEGMENT) + continue; + + round_off_mapping_sizes(mapping); + round_off_mapping_sizes_for_hugepages(mapping, hugepagesize); + + /* + * Size of the reserved address space should not change, since it depends + * upon MaxNBuffers, which can be changed only on restart. + */ + Assert(segment->shmem_reserved == mapping->shmem_reserved); +#ifdef MAP_HUGETLB + if (huge_pages_on && (mapping_sizes->shmem_req_size % hugepagesize != 0)) + mapping_sizes->shmem_req_size += hugepagesize - (mapping_sizes->shmem_req_size % hugepagesize); +#endif + elog(DEBUG1, "segment[%s]: requested size %zu, current size %zu, reserved %zu", + MappingName(i), mapping->shmem_req_size, segment->shmem_size, + segment->shmem_reserved); + + if (segment->shmem == NULL) + continue; + + if (segment->shmem_size == mapping->shmem_req_size) + continue; + + /* + * We should have reserved enough address space for resizing. PANIC if + * that's not the case. 
+ */ + if (segment->shmem_reserved < mapping->shmem_req_size) + ereport(PANIC, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("not enough shared memory is reserved"))); + + elog(DEBUG1, "segment[%s]: resize from %zu to %zu at address %p", + MappingName(i), segment->shmem_size, + mapping->shmem_req_size, segment->shmem); + + /* + * Resize the backing file to resize the allocated memory, and allocate + * more memory on supported platforms if required. + */ + if(ftruncate(segment->segment_fd, mapping->shmem_req_size) == -1) + ereport(ERROR, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not truncate anonymous file for \"%s\": %m", + MappingName(i)))); + if (mapping->shmem_req_size > segment->shmem_size) + shmem_fallocate(segment->segment_fd, MappingName(i), mapping->shmem_req_size, ERROR); + + segment->shmem_size = mapping->shmem_req_size; + shmem_hdr->totalsize = segment->shmem_size; + segment->ShmemEnd = segment->shmem + segment->shmem_size; + } + + return true; +} + /* * PGSharedMemoryCreate * - * Create a shared memory segment of the given size and initialize its + * Create a shared memory segment for the given mapping and initialize its * standard header. Also, register an on_shmem_exit callback to release * the storage. * @@ -697,7 +1087,7 @@ AnonymousShmemDetach(int status, Datum arg) * postmaster or backend. 
*/ PGShmemHeader * -PGSharedMemoryCreate(Size size, +PGSharedMemoryCreate(MemoryMappingSizes *mapping, int segment_id, PGShmemHeader **shim) { IpcMemoryKey NextShmemSegID; @@ -705,6 +1095,7 @@ PGSharedMemoryCreate(Size size, PGShmemHeader *hdr; struct stat statbuf; Size sysvsize; + ShmemSegment *segment = &Segments[segment_id]; /* * We use the data directory's ID info (inode and device numbers) to @@ -717,14 +1108,6 @@ PGSharedMemoryCreate(Size size, errmsg("could not stat data directory \"%s\": %m", DataDir))); - /* Complain if hugepages demanded but we can't possibly support them */ -#if !defined(MAP_HUGETLB) - if (huge_pages == HUGE_PAGES_ON) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("huge pages not supported on this platform"))); -#endif - /* For now, we don't support huge pages in SysV memory */ if (huge_pages == HUGE_PAGES_ON && shared_memory_type != SHMEM_TYPE_MMAP) ereport(ERROR, @@ -732,12 +1115,12 @@ PGSharedMemoryCreate(Size size, errmsg("huge pages not supported with the current \"shared_memory_type\" setting"))); /* Room for a header? */ - Assert(size > MAXALIGN(sizeof(PGShmemHeader))); + Assert(mapping->shmem_req_size > MAXALIGN(sizeof(PGShmemHeader))); if (shared_memory_type == SHMEM_TYPE_MMAP) { - AnonymousShmem = CreateAnonymousSegment(&size); - AnonymousShmemSize = size; + /* On success, mapping data will be modified. */ + CreateAnonymousSegment(mapping, segment_id); /* Register on-exit routine to unmap the anonymous segment */ on_shmem_exit(AnonymousShmemDetach, (Datum) 0); @@ -747,7 +1130,7 @@ PGSharedMemoryCreate(Size size, } else { - sysvsize = size; + sysvsize = mapping->shmem_req_size; /* huge pages are only available with mmap */ SetConfigOption("huge_pages_status", "off", @@ -760,7 +1143,7 @@ PGSharedMemoryCreate(Size size, * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure * that, but prefer fixing it over coping here.) 
*/ - NextShmemSegID = statbuf.st_ino; + NextShmemSegID = statbuf.st_ino + segment_id; for (;;) { @@ -852,13 +1235,13 @@ PGSharedMemoryCreate(Size size, /* * Initialize space allocation status for segment. */ - hdr->totalsize = size; + hdr->totalsize = mapping->shmem_req_size; hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader)); *shim = hdr; /* Save info for possible future use */ - UsedShmemSegAddr = memAddress; - UsedShmemSegID = (unsigned long) NextShmemSegID; + segment->seg_addr = memAddress; + segment->seg_id = (unsigned long) NextShmemSegID; /* * If AnonymousShmem is NULL here, then we're not using anonymous shared @@ -866,10 +1249,10 @@ PGSharedMemoryCreate(Size size, * block. Otherwise, the System V shared memory block is only a shim, and * we must return a pointer to the real block. */ - if (AnonymousShmem == NULL) + if (segment->shmem == NULL) return hdr; - memcpy(AnonymousShmem, hdr, sizeof(PGShmemHeader)); - return (PGShmemHeader *) AnonymousShmem; + memcpy(segment->shmem, hdr, sizeof(PGShmemHeader)); + return (PGShmemHeader *) segment->shmem; } #ifdef EXEC_BACKEND @@ -969,23 +1352,47 @@ PGSharedMemoryNoReAttach(void) void PGSharedMemoryDetach(void) { - if (UsedShmemSegAddr != NULL) + for(int i = 0; i < NUM_MEMORY_MAPPINGS; i++) { - if ((shmdt(UsedShmemSegAddr) < 0) + ShmemSegment *segment = &Segments[i]; + + if (segment->seg_addr != NULL) + { + if ((shmdt(segment->seg_addr) < 0) #if defined(EXEC_BACKEND) && defined(__CYGWIN__) - /* Work-around for cygipc exec bug */ - && shmdt(NULL) < 0 + /* Work-around for cygipc exec bug */ + && shmdt(NULL) < 0 #endif - ) - elog(LOG, "shmdt(%p) failed: %m", UsedShmemSegAddr); - UsedShmemSegAddr = NULL; + ) + elog(LOG, "shmdt(%p) failed: %m", segment->seg_addr); + segment->seg_addr = NULL; + } + + if (segment->shmem != NULL) + { + if (munmap(segment->shmem, segment->shmem_size) < 0) + elog(LOG, "munmap(%p, %zu) failed: %m", + segment->shmem, segment->shmem_size); + segment->shmem = NULL; + } } +} - if (AnonymousShmem != 
NULL) +void +ShmemControlInit(void) +{ + bool foundShmemCtrl; + + ShmemCtrl = (ShmemControl *) + ShmemInitStruct("Shmem Control", sizeof(ShmemControl), + &foundShmemCtrl); + + if (!foundShmemCtrl) { - if (munmap(AnonymousShmem, AnonymousShmemSize) < 0) - elog(LOG, "munmap(%p, %zu) failed: %m", - AnonymousShmem, AnonymousShmemSize); - AnonymousShmem = NULL; + pg_atomic_init_u32(&ShmemCtrl->targetNBuffers, 0); + pg_atomic_init_u32(&ShmemCtrl->currentNBuffers, 0); + pg_atomic_init_flag(&ShmemCtrl->resize_in_progress); + + ShmemCtrl->coordinator = 0; } } diff --git a/src/backend/port/win32_sema.c b/src/backend/port/win32_sema.c index 5854ad1f54d3..e7365ff8060d 100644 --- a/src/backend/port/win32_sema.c +++ b/src/backend/port/win32_sema.c @@ -44,7 +44,7 @@ PGSemaphoreShmemSize(int maxSemas) * process exits. */ void -PGReserveSemaphores(int maxSemas) +PGReserveSemaphores(int maxSemas, int shmem_segment) { mySemSet = (HANDLE *) malloc(maxSemas * sizeof(HANDLE)); if (mySemSet == NULL) diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index 4dee856d6bd6..5c0c32babaf1 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -204,7 +204,7 @@ EnableLockPagesPrivilege(int elevel) * standard header. */ PGShmemHeader * -PGSharedMemoryCreate(Size size, +PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id, PGShmemHeader **shim) { void *memAddress; @@ -216,9 +216,10 @@ PGSharedMemoryCreate(Size size, DWORD size_high; DWORD size_low; SIZE_T largePageSize = 0; - Size orig_size = size; + Size size = mapping_sizes->shmem_req_size; DWORD flProtect = PAGE_READWRITE; DWORD desiredAccess; + ShmemSegment *segment = &Segments[segment_id] ShmemProtectiveRegion = VirtualAlloc(NULL, PROTECTIVE_REGION_SIZE, MEM_RESERVE, PAGE_NOACCESS); @@ -304,7 +305,7 @@ PGSharedMemoryCreate(Size size, * Use the original size, not the rounded-up value, when * falling back to non-huge pages. 
*/ - size = orig_size; + size = mapping_sizes->shmem_req_size; flProtect = PAGE_READWRITE; goto retry; } @@ -393,6 +394,11 @@ PGSharedMemoryCreate(Size size, hdr->dsm_control = 0; /* Save info for possible future use */ + segment->shmem_size = size; + segment->seg_addr = memAddress; + segment->shmem = (Pointer) hdr; + segment->seg_id = (unsigned long) hmap2; + UsedShmemSegAddr = memAddress; UsedShmemSegSize = size; UsedShmemSegID = hmap2; @@ -627,7 +633,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild) * use GetLargePageMinimum() instead. */ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { if (hugepagesize) *hugepagesize = 0; diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index e84e8663e966..ef3f84a55f57 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -654,9 +654,12 @@ CheckpointerMain(const void *startup_data, size_t startup_data_len) static void ProcessCheckpointerInterrupts(void) { - if (ProcSignalBarrierPending) - ProcessProcSignalBarrier(); - + /* + * Reloading config can trigger further signals, complicating interrupts + * processing -- so let it run first. + * + * XXX: Is there any need in memory barrier after ProcessConfigFile? 
+ */ if (ConfigReloadPending) { ConfigReloadPending = false; @@ -676,6 +679,9 @@ ProcessCheckpointerInterrupts(void) UpdateSharedMemoryConfig(); } + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + /* Perform logging of memory contexts of this process */ if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 7c064cf9fbb2..2095713d7c0e 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -110,11 +110,15 @@ #include "replication/slotsync.h" #include "replication/walsender.h" #include "storage/aio_subsys.h" +#include "storage/bufmgr.h" #include "storage/fd.h" #include "storage/io_worker.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" #include "tcop/backend_startup.h" #include "tcop/tcopprot.h" #include "utils/datetime.h" @@ -125,7 +129,6 @@ #ifdef EXEC_BACKEND #include "common/file_utils.h" -#include "storage/pg_shmem.h" #endif @@ -958,6 +961,11 @@ PostmasterMain(int argc, char *argv[]) */ InitializeFastPathLocks(); + /* + * Calculate MaxNBuffers for buffer pool resizing. + */ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. 
*/ diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index fd7c40dcb089..3bc9aee85deb 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -17,6 +17,7 @@ OBJS = \ buf_table.o \ bufmgr.o \ freelist.o \ - localbuf.o + localbuf.o \ + buf_resize.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6fd3a6bbac5e..4a354107185d 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -17,6 +17,7 @@ #include "storage/aio.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "utils/guc.h" BufferDescPadded *BufferDescriptors; char *BufferBlocks; @@ -61,8 +62,12 @@ CkptSortItem *CkptBufferIds; /* * Initialize shared buffer pool * - * This is called once during shared-memory initialization (either in the - * postmaster, or in a standalone backend). + * This is called once during shared-memory initialization. + * TODO: Restore this function to it's initial form. This function should see no + * change in buffer resize patches, except may be use of NBuffersPending. + * + * No locks are taking in this function, it is the caller responsibility to + * make sure only one backend can work with new buffers. */ void BufferManagerShmemInit(void) @@ -71,25 +76,26 @@ BufferManagerShmemInit(void) foundDescs, foundIOCV, foundBufCkpt; + int i; /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) - ShmemInitStruct("Buffer Descriptors", - NBuffers * sizeof(BufferDescPadded), - &foundDescs); + ShmemInitStructInSegment("Buffer Descriptors", + NBuffersPending * sizeof(BufferDescPadded), + &foundDescs, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); /* Align buffer pool on IO page size boundary. 
*/ BufferBlocks = (char *) TYPEALIGN(PG_IO_ALIGN_SIZE, - ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, - &foundBufs)); + ShmemInitStructInSegment("Buffer Blocks", + NBuffersPending * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &foundBufs, BUFFERS_SHMEM_SEGMENT)); /* Align condition variables to cacheline boundary. */ BufferIOCVArray = (ConditionVariableMinimallyPadded *) - ShmemInitStruct("Buffer IO Condition Variables", - NBuffers * sizeof(ConditionVariableMinimallyPadded), - &foundIOCV); + ShmemInitStructInSegment("Buffer IO Condition Variables", + NBuffersPending * sizeof(ConditionVariableMinimallyPadded), + &foundIOCV, BUFFER_IOCV_SHMEM_SEGMENT); /* * The array used to sort to-be-checkpointed buffer ids is located in @@ -99,81 +105,290 @@ BufferManagerShmemInit(void) * painful. */ CkptBufferIds = (CkptSortItem *) - ShmemInitStruct("Checkpoint BufferIds", - NBuffers * sizeof(CkptSortItem), &foundBufCkpt); + ShmemInitStructInSegment("Checkpoint BufferIds", + NBuffersPending * sizeof(CkptSortItem), &foundBufCkpt, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) { /* should find all of these, or none of them */ Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); - /* note: this path is only taken in EXEC_BACKEND case */ - } - else - { - int i; - /* - * Initialize all the buffer headers. + * note: this path is only taken in EXEC_BACKEND case when initializing + * shared memory. */ - for (i = 0; i < NBuffers; i++) - { - BufferDesc *buf = GetBufferDescriptor(i); + } - ClearBufferTag(&buf->tag); + /* + * Initialize all the buffer headers. 
+ */ + for (i = 0; i < NBuffersPending; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); - pg_atomic_init_u32(&buf->state, 0); - buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + ClearBufferTag(&buf->tag); - buf->buf_id = i; + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; - pgaio_wref_clear(&buf->io_wref); + buf->buf_id = i; - LWLockInitialize(BufferDescriptorGetContentLock(buf), - LWTRANCHE_BUFFER_CONTENT); + pgaio_wref_clear(&buf->io_wref); - ConditionVariableInit(BufferDescriptorGetIOCV(buf)); - } + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); } - /* Init other shared buffer-management stuff */ + /* + * Init other shared buffer-management stuff. + */ StrategyInitialize(!foundDescs); /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, &backend_flush_after); + + /* Declare the size of current buffer pool. */ + NBuffers = NBuffersPending; + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, NBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, NBuffers); } /* * BufferManagerShmemSize * * compute the size of shared memory for the buffer pool including - * data pages, buffer descriptors, hash tables, etc. + * data pages, buffer descriptors, hash tables, etc. based on the + * shared memory segment. The main segment must not allocate anything + * related to buffers, every other segment will receive part of the + * data. + * + * Also sets the shmem_reserved field for each segment based on MaxNBuffers. 
*/ Size -BufferManagerShmemSize(void) +BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes) { - Size size = 0; + size_t size; - /* size of buffer descriptors */ - size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); - /* to allow aligning buffer descriptors */ + /* size of buffer descriptors, plus alignment padding */ + size = add_size(0, mul_size(NBuffersPending, sizeof(BufferDescPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, mul_size(MaxNBuffers, sizeof(BufferDescPadded))); size = add_size(size, PG_CACHE_LINE_SIZE); + mapping_sizes[BUFFER_DESCRIPTORS_SHMEM_SEGMENT].shmem_reserved = size; /* size of data pages, plus alignment padding */ - size = add_size(size, PG_IO_ALIGN_SIZE); - size = add_size(size, mul_size(NBuffers, BLCKSZ)); + size = add_size(0, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(NBuffersPending, BLCKSZ)); + mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(MaxNBuffers, BLCKSZ)); + mapping_sizes[BUFFERS_SHMEM_SEGMENT].shmem_reserved = size; - /* size of stuff controlled by freelist.c */ - size = add_size(size, StrategyShmemSize()); - - /* size of I/O condition variables */ - size = add_size(size, mul_size(NBuffers, + /* size of I/O condition variables, plus alignment padding */ + size = add_size(0, mul_size(NBuffersPending, + sizeof(ConditionVariableMinimallyPadded))); + size = add_size(size, PG_CACHE_LINE_SIZE); + mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_req_size = size; + size = add_size(0, mul_size(MaxNBuffers, sizeof(ConditionVariableMinimallyPadded))); - /* to allow aligning the above */ size = add_size(size, PG_CACHE_LINE_SIZE); + mapping_sizes[BUFFER_IOCV_SHMEM_SEGMENT].shmem_reserved = size; /* size of checkpoint sort array in bufmgr.c */ - size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + 
mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_req_size = mul_size(NBuffersPending, sizeof(CkptSortItem)); + mapping_sizes[CHECKPOINT_BUFFERS_SHMEM_SEGMENT].shmem_reserved = mul_size(MaxNBuffers, sizeof(CkptSortItem)); + + /* Allocations in the main memory segment, at the end. */ + + /* size of stuff controlled by freelist.c */ + size = add_size(0, StrategyShmemSize()); return size; } + +/* + * Reinitialize shared buffer manager structures when resizing the buffer pool. + * + * This function is called in the backend which coordinates buffer resizing + * operation. + * + * TODO: Avoid code duplication with BufferManagerShmemInit() and also assess + * which functionality in the latter is required in this function. + */ +void +BufferManagerShmemResize(int currentNBuffers, int targetNBuffers) +{ + bool found; + int i; + void *tmpPtr; + + tmpPtr = (BufferDescPadded *) + ShmemUpdateStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (BufferDescriptors != tmpPtr || !found) + elog(FATAL, "resizing buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemUpdateStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (BufferIOCVArray != tmpPtr || !found) + elog(FATAL, "resizing buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + tmpPtr = (CkptSortItem *) + ShmemUpdateStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if (CkptBufferIds != tmpPtr || !found) + elog(FATAL, "resizing checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d", + CkptBufferIds, tmpPtr, found); + + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + 
ShmemUpdateStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (BufferBlocks != tmpPtr || !found) + elog(FATAL, "resizing buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); + + /* + * Initialize the headers for new buffers. If we are shrinking the + * buffers, currentNBuffers >= targetNBuffers, thus this loop doesn't execute. + */ + for (i = currentNBuffers; i < targetNBuffers; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); + + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PROC_NUMBER; + + buf->buf_id = i; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + /* + * We do not touch StrategyControl here. Instead it is done by background + * writer when handling PROCSIGNAL_BARRIER_SHBUF_EXPAND or + * PROCSIGNAL_BARRIER_SHBUF_SHRINK barrier. + */ +} + +/* + * BufferManagerShmemValidate + * Validate that buffer manager shared memory structures have correct + * pointers and sizes after a resize operation. + * + * This function is called by backends during ProcessBarrierShmemResizeStruct + * to ensure their view of the buffer structures is consistent after memory + * remapping. 
+ */ +void +BufferManagerShmemValidate(int targetNBuffers) +{ + bool found; + void *tmpPtr; + + /* Validate Buffer Descriptors */ + tmpPtr = (BufferDescPadded *) + ShmemInitStructInSegment("Buffer Descriptors", + targetNBuffers * sizeof(BufferDescPadded), + &found, BUFFER_DESCRIPTORS_SHMEM_SEGMENT); + if (!found || BufferDescriptors != tmpPtr) + elog(FATAL, "validating buffer descriptors failed: expected pointer %p, got %p, found=%d", + BufferDescriptors, tmpPtr, found); + + /* Validate Buffer IO Condition Variables */ + tmpPtr = (ConditionVariableMinimallyPadded *) + ShmemInitStructInSegment("Buffer IO Condition Variables", + targetNBuffers * sizeof(ConditionVariableMinimallyPadded), + &found, BUFFER_IOCV_SHMEM_SEGMENT); + if (!found || BufferIOCVArray != tmpPtr) + elog(FATAL, "validating buffer IO condition variables failed: expected pointer %p, got %p, found=%d", + BufferIOCVArray, tmpPtr, found); + + /* Validate Checkpoint BufferIds */ + tmpPtr = (CkptSortItem *) + ShmemInitStructInSegment("Checkpoint BufferIds", + targetNBuffers * sizeof(CkptSortItem), &found, + CHECKPOINT_BUFFERS_SHMEM_SEGMENT); + if (!found || CkptBufferIds != tmpPtr) + elog(FATAL, "validating checkpoint buffer IDs failed: expected pointer %p, got %p, found=%d", + CkptBufferIds, tmpPtr, found); + + /* Validate Buffer Blocks */ + tmpPtr = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStructInSegment("Buffer Blocks", + targetNBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &found, BUFFERS_SHMEM_SEGMENT)); + if (!found || BufferBlocks != tmpPtr) + elog(FATAL, "validating buffer blocks failed: expected pointer %p, got %p, found=%d", + BufferBlocks, tmpPtr, found); +} + +/* + * check_shared_buffers + * GUC check_hook for shared_buffers + * + * When reloading the configuration, shared_buffers should not be set to a value + * higher than max_shared_buffers fixed at the boot time. 
+ */ +bool +check_shared_buffers(int *newval, void **extra, GucSource source) +{ + if (finalMaxNBuffers && *newval > MaxNBuffers) + { + GUC_check_errdetail("\"shared_buffers\" must be less than \"max_shared_buffers\"."); + return false; + } + return true; +} + +/* + * show_shared_buffers + * GUC show_hook for shared_buffers + * + * Shows both current and pending buffer counts with proper unit formatting. + */ +const char * +show_shared_buffers(void) +{ + static char buffer[128]; + int64 current_value, pending_value; + const char *current_unit, *pending_unit; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + + if (currentNBuffers == NBuffersPending) + { + /* No buffer pool resizing pending. */ + convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s", current_value, current_unit); + } + else + { + /* + * New value for NBuffers is loaded but not applied yet, show both + * current and pending. + */ + convert_int_from_base_unit(currentNBuffers, GUC_UNIT_BLOCKS, ¤t_value, ¤t_unit); + convert_int_from_base_unit(NBuffersPending, GUC_UNIT_BLOCKS, &pending_value, &pending_unit); + snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s (pending: " INT64_FORMAT "%s)", + current_value, current_unit, pending_value, pending_unit); + } + + return buffer; +} diff --git a/src/backend/storage/buffer/buf_resize.c b/src/backend/storage/buffer/buf_resize.c new file mode 100644 index 000000000000..e815600c3ba0 --- /dev/null +++ b/src/backend/storage/buffer/buf_resize.c @@ -0,0 +1,399 @@ +/*------------------------------------------------------------------------- + * + * buf_resize.c + * shared buffer pool resizing functionality + * + * This module contains the implementation of shared buffer pool resizing, + * including the main resize coordination function and barrier processing + * functions that synchronize all backends during resize operations. 
+ * + * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/buf_resize.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "fmgr.h" +#include "miscadmin.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "utils/injection_point.h" + + +/* + * Prepare ShmemCtrl for resizing the shared buffer pool. + */ +static void +MarkBufferResizingStart(int targetNBuffers, int currentNBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == currentNBuffers); + + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, targetNBuffers); + ShmemCtrl->coordinator = MyProcPid; +} + +/* + * Reset ShmemCtrl after resizing the shared buffer pool is done. + */ +static void +MarkBufferResizingEnd(int NBuffers) +{ + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + Assert(pg_atomic_read_u32(&ShmemCtrl->currentNBuffers) == NBuffers); + pg_atomic_write_u32(&ShmemCtrl->targetNBuffers, 0); + ShmemCtrl->coordinator = -1; +} + +/* + * Communicate given buffer pool resize barrier to all other backends and the Postmaster. + * + * ProcSignalBarrier is not sent to the Postmaster but we need the Postmaster to + * update its knowledge about the buffer pool so that it can be inherited by the + * child processes. 
+ */ +static void +SharedBufferResizeBarrier(ProcSignalBarrierType barrier, const char *barrier_name) +{ + WaitForProcSignalBarrier(EmitProcSignalBarrier(barrier)); + elog(LOG, "all backends acknowledged %s barrier", barrier_name); + +#ifdef USE_INJECTION_POINTS + /* Injection point specific to this barrier type */ + switch (barrier) + { + case PROCSIGNAL_BARRIER_SHBUF_SHRINK: + INJECTION_POINT("pgrsb-shrink-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM: + INJECTION_POINT("pgrsb-resize-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_EXPAND: + INJECTION_POINT("pgrsb-expand-barrier-sent", NULL); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED: + /* TODO: Add an injection point here. */ + break; + case PROCSIGNAL_BARRIER_SMGRRELEASE: + /* + * Not relevant in this function but it's here so that the compiler + * can detect any missing shared buffer resizing barrier enum here. + */ + break; + } +#endif /* USE_INJECTION_POINTS */ +} + +/* + * C implementation of SQL interface to update the shared buffers according to + * the current values of shared_buffers GUCs. + * + * The current boundaries of the buffer pool are given by two ranges. + * + * - [1, StrategyControl::activeNBuffers] is the range of buffers from which new + * allocations can happen at any time. + * + * - [1, ShmemCtrl::currentNBuffers] is the range of valid buffers at any given + * time. + * + * Let's assume that before resizing, the number of buffers in the buffer pool is + * NBuffersOld. After resizing it is NBuffersNew. Before resizing + * StrategyControl::activeNBuffers == ShmemCtrl::currentNBuffers == NBuffersOld. + * After the resizing finishes StrategyControl::activeNBuffers == + * ShmemCtrl::currentNBuffers == NBuffersNew. Thus when no resizing happens these + * two ranges are same. + * + * Following steps are performed by the coordinator during resizing. + * + * 1. 
Marks resizing in progress to avoid multiple concurrent invocations of this + * function. + * + * 2. When shrinking the shared buffer pool, the coordinator sends SHBUF_SHRINK + * ProcSignalBarrier. In response to this barrier the background writer is expected + * to set StrategyControl::activeNBuffers = NBuffersNew to restrict the new + * buffer allocations only to the new buffer pool size and also reset its + * internal state. Once every backend has acknowledged the barrier, the + * coordinator can be sure that new allocations will not happen in the buffer + * pool area being shrunk. Then it evicts the buffers in that area. Note that + * ShmemCtrl::currentNBuffers is still NBuffersOld, since backends may still + * access buffers allocated before the resizing started. Buffer eviction may fail + * if a buffer being evicted is pinned and the resizing operation is aborted. + * Once the eviction is finished, the extra memory can be freed in the next step. + * + * 3. This step is executed in both cases, when expanding the buffer pool or + * shrinking the buffer pool. The anonymous file backing each of the shared + * memory segments containing the buffer pool shared data structures is resized to + * the amount of memory required for the new buffer pool size. When expanding, the + * expanded portion of memory is initialized appropriately. + * ShmemCtrl::currentNBuffers is set to NBuffersNew to indicate new range of + * valid shared buffers. Every backend is sent SHBUF_RESIZE_MAP_AND_MEM barrier. + * All the backends validate that their pointers to the shared buffers structure + * are valid and have the right size. Once every backend has acknowledged the + * barrier, this step finishes. + * + * 4. When expanding the buffer pool, the coordinator sends SHBUF_EXPAND barrier + * to signal end of expansion. In response, the background writer sets + * StrategyControl::activeNBuffers = NBuffersNew so that new allocations can + * use the expanded range of the buffer pool.
+ * + * TODO: Handle the case when the backend executing this function dies or the + * query is cancelled or it hits an error while resizing. + */ +Datum +pg_resize_shared_buffers(PG_FUNCTION_ARGS) +{ + bool result = true; + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + int targetNBuffers = NBuffersPending; + + if (currentNBuffers == targetNBuffers) + { + elog(LOG, "shared buffers are already at %d, no need to resize", currentNBuffers); + PG_RETURN_BOOL(true); + } + + if (!pg_atomic_test_set_flag(&ShmemCtrl->resize_in_progress)) + { + elog(LOG, "shared buffer resizing already in progress"); + PG_RETURN_BOOL(false); + } + + /* + * TODO: What if the NBuffersPending value seen here is not the desired one + * because somebody did a pg_reload_conf() between the last pg_reload_conf() + * and execution of this function? + */ + MarkBufferResizingStart(targetNBuffers, currentNBuffers); + elog(LOG, "resizing shared buffers from %d to %d", currentNBuffers, targetNBuffers); + + INJECTION_POINT("pg-resize-shared-buffers-flag-set", NULL); + + /* Phase 1: SHBUF_SHRINK - Only for shrinking buffer pool */ + if (targetNBuffers < currentNBuffers) + { + /* + * Phase 1: Shrinking - send SHBUF_SHRINK barrier + * Every backend sets activeNBuffers = NewNBuffers to restrict + * buffer pool allocations to the new size + */ + elog(LOG, "Phase 1: Shrinking buffer pool, restricting allocations to %d buffers", targetNBuffers); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_SHRINK, CppAsString(PROCSIGNAL_BARRIER_SHBUF_SHRINK)); + + /* Evict buffers in the area being shrunk */ + elog(LOG, "evicting buffers %u..%u", targetNBuffers + 1, currentNBuffers); + if (!EvictExtraBuffers(targetNBuffers, currentNBuffers)) + { + elog(WARNING, "failed to evict extra buffers during shrinking"); + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED)); + MarkBufferResizingEnd(currentNBuffers); + 
pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + PG_RETURN_BOOL(false); + } + + /* Update the current NBuffers. */ + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers); + } + + /* Phase 2: SHBUF_RESIZE_MAP_AND_MEM - Both expanding and shrinking */ + elog(LOG, "Phase 2: Remapping shared memory segments and updating structures"); + if (!AnonymousShmemResize()) + { + /* + * This should never fail since address map should already be reserved. + * So the failure should be treated as PANIC. + */ + elog(PANIC, "failed to resize anonymous shared memory"); + } + + /* Update structure pointers and sizes */ + BufferManagerShmemResize(currentNBuffers, targetNBuffers); + + INJECTION_POINT("pgrsb-after-shmem-resize", NULL); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, CppAsString(PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM)); + + /* Phase 3: SHBUF_EXPAND - Only for expanding buffer pool */ + if (targetNBuffers > currentNBuffers) + { + /* + * Phase 3: Expanding - send SHBUF_EXPAND barrier + * Backends set activeNBuffers = NewNBuffers and start allocating + * buffers from the expanded range + */ + elog(LOG, "Phase 3: Expanding buffer pool, enabling allocations up to %d buffers", targetNBuffers); + pg_atomic_write_u32(&ShmemCtrl->currentNBuffers, targetNBuffers); + + SharedBufferResizeBarrier(PROCSIGNAL_BARRIER_SHBUF_EXPAND, CppAsString(PROCSIGNAL_BARRIER_SHBUF_EXPAND)); + } + + /* + * Reset buffer resize control area. + */ + MarkBufferResizingEnd(targetNBuffers); + + pg_atomic_clear_flag(&ShmemCtrl->resize_in_progress); + + elog(LOG, "successfully resized shared buffers to %d", targetNBuffers); + + PG_RETURN_BOOL(result); +} + +bool +ProcessBarrierShmemShrink(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. 
+ */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 1: Delaying SHBUF_SHRINK barrier - restricting allocations to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + + return false; + } + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, targetNBuffers); + /* Reset strategy control to new size */ + StrategyReset(targetNBuffers); + } + + elog(LOG, "Phase 1: Processing SHBUF_SHRINK barrier - NBuffers = %d, coordinator is %d", + NBuffers, ShmemCtrl->coordinator); + + return true; +} + +bool +ProcessBarrierShmemResizeMapAndMem(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * If buffer pool is being shrunk, we are already working with a smaller + * buffer pool, so shrinking address space and shared structures should not + * be a problem. When expanding, expanding the address space and shared + * structures beyond the current boundaries is not going to be a problem + * since we are not accessing that memory yet. So there is no reason to + * delay processing this barrier. + */ + + /* + * Coordinator has already adjusted its address map and also updated sizes + * of the shared buffer structures, no further validation needed. + */ + if (ShmemCtrl->coordinator == MyProcPid) + return true; + + /* + * Backends validate that their pointers to shared buffer structures are + * still valid and have the correct size after memory remapping. + * + * TODO: Do want to do this only in assert enabled builds? 
+ */ + BufferManagerShmemValidate(targetNBuffers); + + elog(LOG, "Backend %d successfully validated structure pointers after resize", MyProcPid); + + return true; +} + +bool +ProcessBarrierShmemExpand(void) +{ + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + /* + * Delay adjusting the new active size of buffer pool till this process + * becomes ready to resize buffers. + */ + if (delay_shmem_resize) + { + elog(LOG, "Phase 3: delaying SHBUF_EXPAND barrier - enabling allocations up to %d buffers, coordinator is %d", + targetNBuffers, ShmemCtrl->coordinator); + return false; + } + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, targetNBuffers); + StrategyReset(targetNBuffers); + } + + elog(LOG, "Phase 3: Processing SHBUF_EXPAND barrier - targetNBuffers = %d, ShmemCtrl->coordinator = %d", targetNBuffers, ShmemCtrl->coordinator); + + return true; +} + +bool +ProcessBarrierShmemResizeFailed(void) +{ + int currentNBuffers = pg_atomic_read_u32(&ShmemCtrl->currentNBuffers); + int targetNBuffers = pg_atomic_read_u32(&ShmemCtrl->targetNBuffers); + + Assert(!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)); + + if (MyBackendType == B_BG_WRITER) + { + /* + * We have to reset the background writer's buffer allocation statistics + * and the strategy control together so that background writer doesn't go + * out of sync with ClockSweepTick(). + * + * TODO: But in case the background writer is not running, nobody would + * reset the strategy control area. 
So we can't rely on background + * worker to do that. So find a better way. + */ + BgBufferSyncReset(NBuffers, currentNBuffers); + /* Reset strategy control to new size */ + StrategyReset(currentNBuffers); + } + + elog(LOG, "received proc signal indicating failure to resize shared buffers from %d to %d, restoring to %d, coordinator is %d", + NBuffers, targetNBuffers, currentNBuffers, ShmemCtrl->coordinator); + + return true; +} \ No newline at end of file diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 9d256559bab9..18c9c6f336c1 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -21,7 +21,13 @@ */ #include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" #include "storage/buf_internals.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "utils/rel.h" +#include "utils/builtins.h" /* entry for buffer lookup hashtable */ typedef struct @@ -59,10 +65,18 @@ InitBufTable(int size) info.entrysize = sizeof(BufferLookupEnt); info.num_partitions = NUM_BUFFER_PARTITIONS; - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", + /* + * The shared buffer look up table is set up only once with maximum possible + * entries considering maximum size of the buffer pool. It is not resized + * after that even if the buffer pool is resized. Hence it is allocated in + * the main shared memory segment and not in a resizeable shared memory + * segment. 
+ */ + SharedBufHash = ShmemInitHashInSegment("Shared Buffer Lookup Table", size, size, &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE); + HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE, + MAIN_SHMEM_SEGMENT); } /* @@ -159,3 +173,56 @@ BufTableDelete(BufferTag *tagPtr, uint32 hashcode) if (!result) /* shouldn't happen */ elog(ERROR, "shared buffer hash table corrupted"); } + +/* + * BufTableGetContents + * Fill the given tuplestore with contents of the shared buffer lookup table + * + * This function is used by pg_buffercache extension to expose buffer lookup + * table contents via SQL. The caller is responsible for setting up the + * tuplestore and result set info. + */ +void +BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc) +{ +/* Expected number of attributes of the buffer lookup table entry. */ +#define BUFTABLE_CONTENTS_COLS 6 + + HASH_SEQ_STATUS hstat; + BufferLookupEnt *ent; + Datum values[BUFTABLE_CONTENTS_COLS]; + bool nulls[BUFTABLE_CONTENTS_COLS]; + int i; + + memset(nulls, 0, sizeof(nulls)); + + Assert(tupdesc->natts == BUFTABLE_CONTENTS_COLS); + + /* + * Lock all buffer mapping partitions to ensure a consistent view of the + * hash table during the scan. Must grab LWLocks in partition-number order + * to avoid LWLock deadlock. + */ + for (i = 0; i < NUM_BUFFER_PARTITIONS; i++) + LWLockAcquire(BufMappingPartitionLockByIndex(i), LW_SHARED); + + hash_seq_init(&hstat, SharedBufHash); + while ((ent = (BufferLookupEnt *) hash_seq_search(&hstat)) != NULL) + { + values[0] = ObjectIdGetDatum(ent->key.spcOid); + values[1] = ObjectIdGetDatum(ent->key.dbOid); + values[2] = ObjectIdGetDatum(ent->key.relNumber); + values[3] = ObjectIdGetDatum(ent->key.forkNum); + values[4] = Int64GetDatum(ent->key.blockNum); + values[5] = Int32GetDatum(ent->id); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* + * Release all buffer mapping partition locks in the reverse order so as + * to avoid LWLock deadlock. 
+ */ + for (i = NUM_BUFFER_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(BufMappingPartitionLockByIndex(i)); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 327ddb7adc88..f489ae2932fc 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,6 +57,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -66,6 +67,7 @@ #include "utils/rel.h" #include "utils/resowner.h" #include "utils/timestamp.h" +#include "utils/injection_point.h" /* Note: these two macros only work on shared buffers, not local ones! */ @@ -3415,6 +3417,9 @@ BufferSync(int flags) ProcessProcSignalBarrier(); } + /* Injection point after scanning all buffers for dirty pages */ + INJECTION_POINT("buffer-sync-dirty-buffer-scan", NULL); + if (num_to_scan == 0) return; /* nothing to do */ @@ -3607,6 +3612,32 @@ BufferSync(int flags) TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan); } +/* + * Information saved between BgBufferSync() calls so we can determine the + * strategy point's advance rate and avoid scanning already-cleaned buffers. The + * variables are global instead of static local so that BgBufferSyncReset() can + * adjust it when resizing shared buffers. 
+ */ +static bool saved_info_valid = false; +static int prev_strategy_buf_id; +static uint32 prev_strategy_passes; +static int next_to_clean; +static uint32 next_passes; + +/* Moving averages of allocation rate and clean-buffer density */ +static float smoothed_alloc = 0; +static float smoothed_density = 10.0; + +void +BgBufferSyncReset(int currentNBuffers, int targetNBuffers) +{ + saved_info_valid = false; +#ifdef BGW_DEBUG + elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d", + currentNBuffers, targetNBuffers); +#endif +} + /* * BgBufferSync -- Write out some dirty buffers in the pool. * @@ -3626,20 +3657,6 @@ BgBufferSync(WritebackContext *wb_context) uint32 strategy_passes; uint32 recent_alloc; - /* - * Information saved between calls so we can determine the strategy - * point's advance rate and avoid scanning already-cleaned buffers. - */ - static bool saved_info_valid = false; - static int prev_strategy_buf_id; - static uint32 prev_strategy_passes; - static int next_to_clean; - static uint32 next_passes; - - /* Moving averages of allocation rate and clean-buffer density */ - static float smoothed_alloc = 0; - static float smoothed_density = 10.0; - /* Potentially these could be tunables, but for now, not */ float smoothing_samples = 16; float scan_whole_pool_milliseconds = 120000.0; @@ -3662,6 +3679,25 @@ BgBufferSync(WritebackContext *wb_context) long new_strategy_delta; uint32 new_recent_alloc; + /* + * If buffer pool is being shrunk the buffer being written out may not remain + * valid. If the buffer pool is being expanded, more buffers will become + * available without even this function writing out any. Hence wait till + * buffer resizing finishes i.e. go into hibernation mode. + * + * TODO: We may not need this synchronization if background worker itself + * becomes the coordinator. 
+ */ + if (!pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) + return true; + + /* + * Resizing shared buffers while this function is performing an LRU scan on + * them may lead to wrong results. Indicate that the resizing should wait for + * the LRU scan to complete. + */ + delay_shmem_resize = true; + /* * Find out where the clock-sweep currently is, and how many buffer * allocations have happened since our last call. @@ -3679,6 +3715,7 @@ BgBufferSync(WritebackContext *wb_context) if (bgwriter_lru_maxpages <= 0) { saved_info_valid = false; + delay_shmem_resize = false; return true; } @@ -3838,8 +3875,17 @@ BgBufferSync(WritebackContext *wb_context) num_written = 0; reusable_buffers = reusable_buffers_est; - /* Execute the LRU scan */ - while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) + /* + * Execute the LRU scan. + * + * If buffer pool is being shrunk, the buffer being written may not remain + * valid. If the buffer pool is being expanded, more buffers will become + * available without even this function writing any. Hence stop what we are doing. This + * also unblocks other processes that are waiting for buffer resizing to + * finish. + */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est && + !pg_atomic_unlocked_test_flag(&ShmemCtrl->resize_in_progress)) { int sync_state = SyncOneBuffer(next_to_clean, true, wb_context); @@ -3898,6 +3944,9 @@ BgBufferSync(WritebackContext *wb_context) #endif } + /* Let the resizing commence. */ + delay_shmem_resize = false; + /* Return true if OK to hibernate */ return (bufs_to_lap == 0 && recent_alloc == 0); } @@ -4208,7 +4257,23 @@ DebugPrintBufferRefcount(Buffer buffer) void CheckPointBuffers(int flags) { + /* Mark that buffer sync is in progress - delay any shared memory resizing. */ + /* + * TODO: We need to assess whether we should allow checkpoint and buffer + * resizing to run in parallel. 
When expanding buffers it may be fine to let
+	 * the checkpointer run in RESIZE_MAP_AND_MEM phase but delay EXPAND
+	 * phase till the checkpoint finishes, at the same time not allow checkpoint
+	 * to run during expansion phase. When shrinking the buffers, we should
+	 * delay SHRINK phase till checkpoint finishes and not allow to start
+	 * checkpoint till SHRINK phase is done, but allow it to run in
+	 * RESIZE_MAP_AND_MEM phase. This needs careful analysis and testing.
+	 */
+	delay_shmem_resize = true;
+
 	BufferSync(flags);
+
+	/* Mark that buffer sync is no longer in progress - allow shared memory resizing */
+	delay_shmem_resize = false;
 }
 
 /*
@@ -7466,3 +7531,70 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = {
 	.complete_local = local_buffer_readv_complete,
 	.report = buffer_readv_report,
 };
+
+/*
+ * When shrinking shared buffers pool, evict the buffers which will not be part
+ * of the shrunk buffer pool.
+ */
+bool
+EvictExtraBuffers(int targetNBuffers, int currentNBuffers)
+{
+	bool		result = true;
+
+	Assert(targetNBuffers < currentNBuffers);
+
+	/*
+	 * If the buffer being evicted is locked, this function will need to wait.
+	 * This function should not be called from a Postmaster since it can not wait on a lock.
+	 */
+	Assert(IsUnderPostmaster);
+
+	/*
+	 * TODO: Before evicting any buffer, we should check whether any of the
+	 * buffers are pinned. If we find that a buffer is pinned after evicting
+	 * most of them, that will impact performance since all those evicted
+	 * buffers might need to be read again.
+	 */
+	for (Buffer buf = targetNBuffers + 1; buf <= currentNBuffers; buf++)
+	{
+		BufferDesc *desc = GetBufferDescriptor(buf - 1);
+		uint32		buf_state;
+		bool		buffer_flushed;
+
+		buf_state = pg_atomic_read_u32(&desc->state);
+
+		/*
+		 * Nobody is expected to touch the buffers while resizing is
+		 * going on, hence unlocked precheck should be safe and saves
+		 * some cycles.
+ */ + if (!(buf_state & BM_VALID)) + continue; + + /* + * XXX: Looks like CurrentResourceOwner can be NULL here, find + * another one in that case? + * */ + if (CurrentResourceOwner) + ResourceOwnerEnlarge(CurrentResourceOwner); + + ReservePrivateRefCountEntry(); + + LockBufHdr(desc); + + /* + * Now that we have locked buffer descriptor, make sure that the + * buffer without valid data has been skipped above. + */ + Assert(buf_state & BM_VALID); + + if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) + { + elog(WARNING, "could not remove buffer %u, it is pinned", buf); + result = false; + break; + } + } + + return result; +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 28d952b35344..256521d889af 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -19,6 +19,7 @@ #include "port/atomics.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) @@ -32,10 +33,16 @@ typedef struct /* Spinlock: protects the values below */ slock_t buffer_strategy_lock; + /* + * Number of active buffers that can be allocated. During buffer resizing, + * this may be different from NBuffers which tracks the global buffer count. + */ + pg_atomic_uint32 activeNBuffers; + /* * clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to - * get an actual buffer, it needs to be used modulo NBuffers. + * get an actual buffer, it needs to be used modulo activeNBuffers. */ pg_atomic_uint32 nextVictimBuffer; @@ -100,21 +107,27 @@ static inline uint32 ClockSweepTick(void) { uint32 victim; + int activeBuffers; /* - * Atomically move hand ahead one buffer - if there's several processes - * doing this, this can lead to buffers being returned slightly out of - * apparent order. 
+ * Atomically move hand ahead one buffer - if there's several processes doing + * this, this can lead to buffers being returned slightly out of apparent + * order. We need to read both the current position of hand and the current + * buffer allocation limit together consistently. They may be reset by + * concurrent resize. */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); + activeBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + SpinLockRelease(&StrategyControl->buffer_strategy_lock); - if (victim >= NBuffers) + if (victim >= activeBuffers) { uint32 originalVictim = victim; /* always wrap what we look up in BufferDescriptors */ - victim = victim % NBuffers; + victim = victim % activeBuffers; /* * If we're the one that just caused a wraparound, force @@ -142,7 +155,7 @@ ClockSweepTick(void) */ SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - wrapped = expected % NBuffers; + wrapped = expected % activeBuffers; success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, &expected, wrapped); @@ -227,7 +240,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); /* Use the "clock sweep" algorithm to find a free buffer */ - trycounter = NBuffers; + trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers); + for (;;) { uint32 old_buf_state; @@ -280,7 +294,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, local_buf_state)) { - trycounter = NBuffers; + trycounter = pg_atomic_read_u32(&StrategyControl->activeNBuffers); break; } } @@ -322,10 +336,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) { uint32 nextVictimBuffer; int result; + uint32 activeNBuffers; SpinLockAcquire(&StrategyControl->buffer_strategy_lock); 
nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
-	result = nextVictimBuffer % NBuffers;
+	activeNBuffers = pg_atomic_read_u32(&StrategyControl->activeNBuffers);
+	result = nextVictimBuffer % activeNBuffers;
 
 	if (complete_passes)
 	{
@@ -335,7 +351,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
 		 * Additionally add the number of wraparounds that happened before
 		 * completePasses could be incremented. C.f. ClockSweepTick().
 		 */
-		*complete_passes += nextVictimBuffer / NBuffers;
+		*complete_passes += nextVictimBuffer / activeNBuffers;
 	}
 
 	if (num_buf_alloc)
@@ -382,7 +398,7 @@ StrategyShmemSize(void)
 	Size		size = 0;
 
 	/* size of lookup hash table ... see comment in StrategyInitialize */
-	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+	size = add_size(size, BufTableShmemSize(MaxNBuffers + NUM_BUFFER_PARTITIONS));
 
 	/* size of the shared replacement strategy control block */
 	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
@@ -390,6 +406,31 @@ StrategyShmemSize(void)
 	return size;
 }
 
+void
+StrategyReset(int activeNBuffers)
+{
+	Assert(StrategyControl);
+
+	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+	/* Update the active buffer count for the strategy */
+	pg_atomic_write_u32(&StrategyControl->activeNBuffers, activeNBuffers);
+
+	/* Reset the clock-sweep pointer to start from beginning */
+	pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0);
+
+	/*
+	 * The statistics are viewed in the context of the number of shared buffers.
+	 * Reset them as the size of active number of shared buffers changes.
+	 */
+	StrategyControl->completePasses = 0;
+	pg_atomic_write_u32(&StrategyControl->numBufferAllocs, 0);
+
+	/* TODO: Do we need to reset background writer notifications? */
+	StrategyControl->bgwprocno = -1;
+	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
 /*
  * StrategyInitialize -- initialize the buffer cache replacement
  *		strategy.
@@ -407,20 +448,29 @@ StrategyInitialize(bool init) * * Since we can't tolerate running out of lookup table entries, we must be * sure to specify an adequate table size here. The maximum steady-state - * usage is of course NBuffers entries, but BufferAlloc() tries to insert - * a new entry before deleting the old. In principle this could be - * happening in each partition concurrently, so we could need as many as - * NBuffers + NUM_BUFFER_PARTITIONS entries. + * usage is of course is as many number of entries as the number of buffers + * in the buffer pool. Right now there is no way to free shared memory. Even + * if we shrink the buffer lookup table when shrinking the buffer pool the + * unused hash table entries can not be freed. When we expand the buffer + * pool, more entries can be allocated but we can not resize the hash table + * directory without rehashing all the entries. Just allocating more entries + * will lead to more contention. Hence we setup the buffer lookup table + * considering the maximum possible size of the buffer pool which is + * MaxNBuffers. + * + * Additionally BufferAlloc() tries to insert a new entry before deleting the + * old. In principle this could be happening in each partition concurrently, + * so we need extra NUM_BUFFER_PARTITIONS entries. 
*/ - InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); + InitBufTable(MaxNBuffers + NUM_BUFFER_PARTITIONS); /* * Get or create the shared strategy control block */ StrategyControl = (BufferStrategyControl *) - ShmemInitStruct("Buffer Strategy Status", + ShmemInitStructInSegment("Buffer Strategy Status", sizeof(BufferStrategyControl), - &found); + &found, MAIN_SHMEM_SEGMENT); if (!found) { @@ -431,6 +481,8 @@ StrategyInitialize(bool init) SpinLockInit(&StrategyControl->buffer_strategy_lock); + /* Initialize the active buffer count */ + pg_atomic_init_u32(&StrategyControl->activeNBuffers, NBuffersPending); /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); @@ -668,12 +720,23 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) strategy->current = 0; /* - * If the slot hasn't been filled yet, tell the caller to allocate a new - * buffer with the normal allocation strategy. He will then fill this - * slot by calling AddBufferToRing with the new buffer. + * If the slot hasn't been filled yet or the buffer in the slot has been + * invalidated when buffer pool was shrunk, tell the caller to allocate a new + * buffer with the normal allocation strategy. He will then fill this slot + * by calling AddBufferToRing with the new buffer. + * + * TODO: Ideally we would want to check for bufnum > NBuffers only once + * after every time the buffer pool is shrunk so as to catch any runtime + * bugs that introduce invalid buffers in the ring. But that is complicated. + * The BufferAccessStrategy objects are not accessible outside the + * ScanState. Hence we can not purge the buffers while evicting the buffers. + * After the resizing is finished, it's not possible to notice when we touch + * the first of those objects and the last of objects. See if this can + * fixed. 
*/ bufnum = strategy->buffers[strategy->current]; - if (bufnum == InvalidBuffer) + if (bufnum == InvalidBuffer || + bufnum > pg_atomic_read_u32(&StrategyControl->activeNBuffers)) return NULL; buf = GetBufferDescriptor(bufnum - 1); diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build index 448976d2400b..2fc58db5a917 100644 --- a/src/backend/storage/buffer/meson.build +++ b/src/backend/storage/buffer/meson.build @@ -6,4 +6,5 @@ backend_sources += files( 'bufmgr.c', 'freelist.c', 'localbuf.c', + 'buf_resize.c', ) diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c index 2704e80b3a7d..1965b2d3eb4d 100644 --- a/src/backend/storage/ipc/ipc.c +++ b/src/backend/storage/ipc/ipc.c @@ -61,6 +61,8 @@ static void proc_exit_prepare(int code); * but provide some additional features we need --- in particular, * we want to register callbacks to invoke when we are disconnecting * from a broken shared-memory context but not exiting the postmaster. + * Maximum number of such exit callbacks depends on the number of shared + * segments. * * Callback functions can take zero, one, or two args: the first passed * arg is the integer exitcode, the second is the Datum supplied when @@ -68,7 +70,7 @@ static void proc_exit_prepare(int code); * ---------------------------------------------------------------- */ -#define MAX_ON_EXITS 20 +#define MAX_ON_EXITS 40 struct ONEXIT { diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index b23d0c19360a..23e9b53ea074 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -81,10 +81,17 @@ RequestAddinShmemSpace(Size size) /* * CalculateShmemSize - * Calculates the amount of shared memory needed. + * Calculates the amount of shared memory needed. + * + * The amount of shared memory required per segment is saved in mapping_sizes, + * which is expected to be an array of size NUM_MEMORY_MAPPINGS. 
The total + * amount of memory needed across all the segments is returned. For the memory + * mappings which reserve address space for future expansion, the required + * amount of reserved space is saved in mapping_sizes of those segments. + * This memory is not included in the returned value. */ Size -CalculateShmemSize(void) +CalculateShmemSize(MemoryMappingSizes *mapping_sizes) { Size size; @@ -102,7 +109,13 @@ CalculateShmemSize(void) sizeof(ShmemIndexEnt))); size = add_size(size, dsm_estimate_size()); size = add_size(size, DSMRegistryShmemSize()); - size = add_size(size, BufferManagerShmemSize()); + + /* + * Buffer manager adds estimates for memory requirements for every shared + * memory segment that it uses in the corresponding AnonymousMappings. + * Consider size required from only the main shared memory segment here. + */ + size = add_size(size, BufferManagerShmemSize(mapping_sizes)); size = add_size(size, LockManagerShmemSize()); size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); @@ -141,11 +154,32 @@ CalculateShmemSize(void) size = add_size(size, AioShmemSize()); size = add_size(size, WaitLSNShmemSize()); + /* + * XXX: For some reason slightly more memory is needed for larger + * shared_buffers, but this size is enough for any large value I've tested + * with. Is it a mistake in how slots are split, or there was a hidden + * inconsistency in shmem calculation? + */ + size = add_size(size, 1024 * 1024 * 100); + /* include additional requested shmem from preload libraries */ size = add_size(size, total_addin_request); + /* + * All the shared memory allocations considered so far happen in the main + * shared memory segment. 
+	 */
+	mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_req_size = size;
+	mapping_sizes[MAIN_SHMEM_SEGMENT].shmem_reserved = size;
+
+	size = 0;
 	/* might as well round it off to a multiple of a typical page size */
-	size = add_size(size, 8192 - (size % 8192));
+	for (int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+	{
+		round_off_mapping_sizes(&mapping_sizes[segment]);
+		/* Compute the total size of all segments */
+		size = size + mapping_sizes[segment].shmem_req_size;
+	}
 
 	return size;
 }
@@ -191,32 +225,44 @@ CreateSharedMemoryAndSemaphores(void)
 {
 	PGShmemHeader *shim;
 	PGShmemHeader *seghdr;
-	Size		size;
+	MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS];
 
 	Assert(!IsUnderPostmaster);
 
-	/* Compute the size of the shared-memory block */
-	size = CalculateShmemSize();
-	elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
-
-	/*
-	 * Create the shmem segment
-	 */
-	seghdr = PGSharedMemoryCreate(size, &shim);
-
-	/*
-	 * Make sure that huge pages are never reported as "unknown" while the
-	 * server is running.
-	 */
-	Assert(strcmp("unknown",
-				  GetConfigOption("huge_pages_status", false, false)) != 0);
+	CalculateShmemSize(mapping_sizes);
 
-	InitShmemAccess(seghdr);
+	/* Decide if we use huge pages or regular size pages */
+	PrepareHugePages();
 
-	/*
-	 * Set up shared memory allocation mechanism
-	 */
-	InitShmemAllocation();
+	for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++)
+	{
+		MemoryMappingSizes *mapping = &mapping_sizes[segment];
+
+		/* Compute the size of the shared-memory block */
+		elog(DEBUG3, "invoking IpcMemoryCreate(segment %s, size=%zu, reserved address space=%zu)",
+			 MappingName(segment), mapping->shmem_req_size, mapping->shmem_reserved);
+
+		/*
+		 * Create the shmem segment.
+		 *
+		 * XXX: Are multiple shims needed, one per segment?
+		 */
+		seghdr = PGSharedMemoryCreate(mapping, segment, &shim);
+
+		/*
+		 * Make sure that huge pages are never reported as "unknown" while the
+		 * server is running.
+ */ + Assert(strcmp("unknown", + GetConfigOption("huge_pages_status", false, false)) != 0); + + InitShmemAccessInSegment(seghdr, segment); + + /* + * Set up shared memory allocation mechanism + */ + InitShmemAllocationInSegment(segment); + } /* Initialize subsystems */ CreateOrAttachShmemStructs(); @@ -274,6 +320,8 @@ CreateOrAttachShmemStructs(void) CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); + /* TODO: This should be part of BufferManagerShmemInit() */ + ShmemControlInit(); BufferManagerShmemInit(); /* @@ -334,7 +382,9 @@ CreateOrAttachShmemStructs(void) * InitializeShmemGUCs * * This function initializes runtime-computed GUCs related to the amount of - * shared memory required for the current configuration. + * shared memory required for the current configuration. It assumes that the + * memory required by the shared memory segments is already calculated and is + * available in AnonymousMappings. */ void InitializeShmemGUCs(void) @@ -343,11 +393,13 @@ InitializeShmemGUCs(void) Size size_b; Size size_mb; Size hp_size; + MemoryMappingSizes mapping_sizes[NUM_MEMORY_MAPPINGS]; + /* * Calculate the shared memory size and round up to the nearest megabyte. */ - size_b = CalculateShmemSize(); + size_b = CalculateShmemSize(mapping_sizes); size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); sprintf(buf, "%zu", size_mb); SetConfigOption("shared_memory_size", buf, @@ -356,7 +408,7 @@ InitializeShmemGUCs(void) /* * Calculate the number of huge pages required. 
*/ - GetHugePageSize(&hp_size, NULL); + GetHugePageSize(&hp_size, NULL, NULL); if (hp_size != 0) { Size hp_required; diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 087821311cce..c7c36f2be675 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -24,9 +24,11 @@ #include "port/pg_bitutils.h" #include "replication/logicalworker.h" #include "replication/walsender.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "storage/ipc.h" #include "storage/latch.h" +#include "storage/pg_shmem.h" #include "storage/shmem.h" #include "storage/sinval.h" #include "storage/smgr.h" @@ -109,6 +111,10 @@ static bool CheckProcSignal(ProcSignalReason reason); static void CleanupProcSignalState(int status, Datum arg); static void ResetProcSignalBarrierBits(uint32 flags); +#ifdef DEBUG_SHMEM_RESIZE +bool delay_proc_signal_init = false; +#endif + /* * ProcSignalShmemSize * Compute space needed for ProcSignal's shared memory @@ -170,6 +176,43 @@ ProcSignalInit(const uint8 *cancel_key, int cancel_key_len) uint32 old_pss_pid; Assert(cancel_key_len >= 0 && cancel_key_len <= MAX_CANCEL_KEY_LENGTH); + +#ifdef DEBUG_SHMEM_RESIZE + /* + * Introduced for debugging purposes. You can change the variable at + * runtime using gdb, then start new backends with delayed ProcSignal + * initialization. Simple pg_usleep wont work here due to SIGHUP interrupt + * needed for testing. 
Taken from pg_sleep; + */ + if (delay_proc_signal_init) + { +#define GetNowFloat() ((float8) GetCurrentTimestamp() / 1000000.0) + float8 endtime = GetNowFloat() + 5; + + for (;;) + { + float8 delay; + long delay_ms; + + CHECK_FOR_INTERRUPTS(); + + delay = endtime - GetNowFloat(); + if (delay >= 600.0) + delay_ms = 600000; + else if (delay > 0.0) + delay_ms = (long) (delay * 1000.0); + else + break; + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + delay_ms, + WAIT_EVENT_PG_SLEEP); + ResetLatch(MyLatch); + } + } +#endif + if (MyProcNumber < 0) elog(ERROR, "MyProcNumber not set"); if (MyProcNumber >= NumProcSignalSlots) @@ -576,6 +619,18 @@ ProcessProcSignalBarrier(void) case PROCSIGNAL_BARRIER_SMGRRELEASE: processed = ProcessBarrierSmgrRelease(); break; + case PROCSIGNAL_BARRIER_SHBUF_SHRINK: + processed = ProcessBarrierShmemShrink(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM: + processed = ProcessBarrierShmemResizeMapAndMem(); + break; + case PROCSIGNAL_BARRIER_SHBUF_EXPAND: + processed = ProcessBarrierShmemExpand(); + break; + case PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED: + processed = ProcessBarrierShmemResizeFailed(); + break; } /* diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 0f18beb6ad4a..eafcb665ba91 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -69,27 +69,34 @@ #include "funcapi.h" #include "miscadmin.h" #include "port/pg_numa.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" +#include "storage/ipc.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/procsignal.h" #include "storage/shmem.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/injection_point.h" +#include "utils/wait_event.h" static void *ShmemAllocRaw(Size size, Size *allocated_size); -static void *ShmemAllocUnlocked(Size size); +static 
void *ShmemAllocRawInSegment(Size size, Size *allocated_size, + int shmem_segment); /* shared memory global variables */ -static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ +ShmemSegment Segments[NUM_MEMORY_MAPPINGS]; -static void *ShmemBase; /* start address of shared memory */ - -static void *ShmemEnd; /* end+1 address of shared memory */ - -slock_t *ShmemLock; /* spinlock for shared memory and LWLock - * allocation */ - -static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ +/* + * Primary index hashtable for shmem, for simplicity we use a single for all + * shared memory segments. There can be performance consequences of that, and + * an alternative option would be to have one index per shared memory segments. + */ +static HTAB *ShmemIndex = NULL; /* To get reliable results for NUMA inquiry we need to "touch pages" once */ static bool firstNumaTouch = true; @@ -102,9 +109,17 @@ Datum pg_numa_available(PG_FUNCTION_ARGS); void InitShmemAccess(PGShmemHeader *seghdr) { - ShmemSegHdr = seghdr; - ShmemBase = seghdr; - ShmemEnd = (char *) ShmemBase + seghdr->totalsize; + InitShmemAccessInSegment(seghdr, MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAccessInSegment(PGShmemHeader *seghdr, int shmem_segment) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + ShmemSegment *seg = &Segments[shmem_segment]; + seg->ShmemSegHdr = shmhdr; + seg->ShmemBase = (void *) shmhdr; + seg->ShmemEnd = (char *) seg->ShmemBase + shmhdr->totalsize; } /* @@ -115,7 +130,13 @@ InitShmemAccess(PGShmemHeader *seghdr) void InitShmemAllocation(void) { - PGShmemHeader *shmhdr = ShmemSegHdr; + InitShmemAllocationInSegment(MAIN_SHMEM_SEGMENT); +} + +void +InitShmemAllocationInSegment(int shmem_segment) +{ + PGShmemHeader *shmhdr = Segments[shmem_segment].ShmemSegHdr; char *aligned; Assert(shmhdr != NULL); @@ -124,9 +145,9 @@ InitShmemAllocation(void) * Initialize the spinlock used by ShmemAlloc. 
We must use * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. */ - ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + Segments[shmem_segment].ShmemLock = (slock_t *) ShmemAllocUnlockedInSegment(sizeof(slock_t), shmem_segment); - SpinLockInit(ShmemLock); + SpinLockInit(Segments[shmem_segment].ShmemLock); /* * Allocations after this point should go through ShmemAlloc, which @@ -151,16 +172,22 @@ InitShmemAllocation(void) */ void * ShmemAlloc(Size size) +{ + return ShmemAllocInSegment(size, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemAllocInSegment(Size size, int shmem_segment) { void *newSpace; Size allocated_size; - newSpace = ShmemAllocRaw(size, &allocated_size); + newSpace = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (!newSpace) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); return newSpace; } @@ -185,6 +212,12 @@ ShmemAllocNoError(Size size) */ static void * ShmemAllocRaw(Size size, Size *allocated_size) +{ + return ShmemAllocRawInSegment(size, allocated_size, MAIN_SHMEM_SEGMENT); +} + +static void * +ShmemAllocRawInSegment(Size size, Size *allocated_size, int shmem_segment) { Size newStart; Size newFree; @@ -204,22 +237,22 @@ ShmemAllocRaw(Size size, Size *allocated_size) size = CACHELINEALIGN(size); *allocated_size = size; - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[shmem_segment].ShmemLock); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree <= ShmemSegHdr->totalsize) + if (newFree <= Segments[shmem_segment].ShmemSegHdr->totalsize) { - newSpace = (char *) ShmemBase + newStart; - ShmemSegHdr->freeoffset = newFree; + newSpace = (char *) 
Segments[shmem_segment].ShmemBase + newStart; + Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; } else newSpace = NULL; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[shmem_segment].ShmemLock); /* note this assert is okay with newSpace == NULL */ Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); @@ -228,15 +261,16 @@ ShmemAllocRaw(Size size, Size *allocated_size) } /* - * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory + * ShmemAllocUnlockedInSegment + * allocate max-aligned chunk from given shared memory segment * * Allocate space without locking ShmemLock. This should be used for, * and only for, allocations that must happen before ShmemLock is ready. * * We consider maxalign, rather than cachealign, sufficient here. */ -static void * -ShmemAllocUnlocked(Size size) +void * +ShmemAllocUnlockedInSegment(Size size, int shmem_segment) { Size newStart; Size newFree; @@ -247,19 +281,19 @@ ShmemAllocUnlocked(Size size) */ size = MAXALIGN(size); - Assert(ShmemSegHdr != NULL); + Assert(Segments[shmem_segment].ShmemSegHdr != NULL); - newStart = ShmemSegHdr->freeoffset; + newStart = Segments[shmem_segment].ShmemSegHdr->freeoffset; newFree = newStart + size; - if (newFree > ShmemSegHdr->totalsize) + if (newFree > Segments[shmem_segment].ShmemSegHdr->totalsize) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of shared memory (%zu bytes requested)", - size))); - ShmemSegHdr->freeoffset = newFree; + errmsg("out of shared memory in segment %s (%zu bytes requested)", + MappingName(shmem_segment), size))); + Segments[shmem_segment].ShmemSegHdr->freeoffset = newFree; - newSpace = (char *) ShmemBase + newStart; + newSpace = (char *) Segments[shmem_segment].ShmemBase + newStart; Assert(newSpace == (void *) MAXALIGN(newSpace)); @@ -274,7 +308,13 @@ ShmemAllocUnlocked(Size size) bool ShmemAddrIsValid(const void *addr) { - return (addr >= ShmemBase) && (addr < ShmemEnd); + return ShmemAddrIsValidInSegment(addr, 
MAIN_SHMEM_SEGMENT); +} + +bool +ShmemAddrIsValidInSegment(const void *addr, int shmem_segment) +{ + return (addr >= Segments[shmem_segment].ShmemBase) && (addr < Segments[shmem_segment].ShmemEnd); } /* @@ -335,6 +375,18 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ int64 max_size, /* max size of the table */ HASHCTL *infoP, /* info about key and bucket size */ int hash_flags) /* info about infoP */ +{ + return ShmemInitHashInSegment(name, init_size, max_size, infoP, hash_flags, + MAIN_SHMEM_SEGMENT); +} + +HTAB * +ShmemInitHashInSegment(const char *name, /* table string name for shmem index */ + long init_size, /* initial table size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags, /* info about infoP */ + int shmem_segment) /* in which segment to keep the table */ { bool found; void *location; @@ -351,9 +403,9 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE; /* look it up in the shmem index */ - location = ShmemInitStruct(name, + location = ShmemInitStructInSegment(name, hash_get_shared_size(infoP, hash_flags), - &found); + &found, shmem_segment); /* * if it already exists, attach to it rather than allocate and initialize @@ -386,6 +438,13 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ */ void * ShmemInitStruct(const char *name, Size size, bool *foundPtr) +{ + return ShmemInitStructInSegment(name, size, foundPtr, MAIN_SHMEM_SEGMENT); +} + +void * +ShmemInitStructInSegment(const char *name, Size size, bool *foundPtr, + int shmem_segment) { ShmemIndexEnt *result; void *structPtr; @@ -394,7 +453,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) if (!ShmemIndex) { - PGShmemHeader *shmemseghdr = ShmemSegHdr; + PGShmemHeader *shmemseghdr = Segments[shmem_segment].ShmemSegHdr; /* Must be trying to create/attach to ShmemIndex itself */ 
Assert(strcmp(name, "ShmemIndex") == 0); @@ -417,7 +476,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) * process can be accessing shared memory yet. */ Assert(shmemseghdr->index == NULL); - structPtr = ShmemAlloc(size); + structPtr = ShmemAllocInSegment(size, shmem_segment); shmemseghdr->index = structPtr; *foundPtr = false; } @@ -434,16 +493,15 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) LWLockRelease(ShmemIndexLock); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("could not create ShmemIndex entry for data structure \"%s\"", - name))); + errmsg("could not create ShmemIndex entry for data structure \"%s\" in segment %d", + name, shmem_segment))); } if (*foundPtr) { /* * Structure is in the shmem index so someone else has allocated it - * already. The size better be the same as the size we are trying to - * initialize to, or there is a name conflict (or worse). + * already. The size better be the same as the size we are trying to */ if (result->size != size) { @@ -453,6 +511,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) " \"%s\": expected %zu, actual %zu", name, size, result->size))); } + structPtr = result->location; } else @@ -460,7 +519,7 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) Size allocated_size; /* It isn't in the table yet. 
allocate and initialize it */ - structPtr = ShmemAllocRaw(size, &allocated_size); + structPtr = ShmemAllocRawInSegment(size, &allocated_size, shmem_segment); if (structPtr == NULL) { /* out of memory; remove the failed ShmemIndex entry */ @@ -475,18 +534,71 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) result->size = size; result->allocated_size = allocated_size; result->location = structPtr; + result->shmem_segment = shmem_segment; } LWLockRelease(ShmemIndexLock); - Assert(ShmemAddrIsValid(structPtr)); + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); + + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + +/* + * ShmemUpdateStructInSegment -- Update the size of a structure in shared memory. + * + * This function updates the size of an existing shared memory structure. It + * finds the structure in the shmem index and updates its size information while + * preserving the existing memory location. + * + * Returns: pointer to the existing structure location. + */ +void * +ShmemUpdateStructInSegment(const char *name, Size size, bool *foundPtr, + int shmem_segment) +{ + ShmemIndexEnt *result; + void *structPtr; + Size delta; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + Assert(ShmemIndex); + + /* Look up the structure in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_FIND, foundPtr); + + Assert(*foundPtr); + Assert(result); + Assert(result->shmem_segment == shmem_segment); + + delta = size - result->size; + /* Store the existing structure pointer */ + structPtr = result->location; + + /* Update the size information. + TODO: Ideally we should implement repalloc kind of functionality for shared memory which will return allocated size. 
*/ + result->size = size; + result->allocated_size = size; + /* Reflect size change in the shared segment */ + SpinLockAcquire(Segments[shmem_segment].ShmemLock); + Segments[shmem_segment].ShmemSegHdr->freeoffset += delta; + SpinLockRelease(Segments[shmem_segment].ShmemLock); + LWLockRelease(ShmemIndexLock); + + /* Verify the structure is still in the correct segment */ + Assert(ShmemAddrIsValidInSegment(structPtr, shmem_segment)); Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); return structPtr; } + /* * Add two Size values, checking for overflow */ @@ -527,13 +639,14 @@ mul_size(Size s1, Size s2) Datum pg_get_shmem_allocations(PG_FUNCTION_ARGS) { -#define PG_GET_SHMEM_SIZES_COLS 4 +#define PG_GET_SHMEM_SIZES_COLS 5 ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; HASH_SEQ_STATUS hstat; ShmemIndexEnt *ent; - Size named_allocated = 0; + Size named_allocated[NUM_MEMORY_MAPPINGS] = {0}; Datum values[PG_GET_SHMEM_SIZES_COLS]; bool nulls[PG_GET_SHMEM_SIZES_COLS]; + int i; InitMaterializedSRF(fcinfo, 0); @@ -546,29 +659,40 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS) while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) { values[0] = CStringGetTextDatum(ent->key); - values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); - values[2] = Int64GetDatum(ent->size); - values[3] = Int64GetDatum(ent->allocated_size); - named_allocated += ent->allocated_size; + values[1] = CStringGetTextDatum(MappingName(ent->shmem_segment)); + values[2] = Int64GetDatum((char *) ent->location - (char *) Segments[ent->shmem_segment].ShmemSegHdr); + values[3] = Int64GetDatum(ent->size); + values[4] = Int64GetDatum(ent->allocated_size); + named_allocated[ent->shmem_segment] += ent->allocated_size; tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); } /* output shared memory allocated but not counted via the shmem index */ - values[0] = CStringGetTextDatum(""); - nulls[1] = true; - values[2] = 
Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + values[0] = CStringGetTextDatum(""); + values[1] = CStringGetTextDatum(MappingName(i)); + nulls[2] = true; + values[3] = Int64GetDatum(Segments[i].ShmemSegHdr->freeoffset - named_allocated[i]); + values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } /* output as-of-yet unused shared memory */ - nulls[0] = true; - values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); - nulls[1] = false; - values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); - values[3] = values[2]; - tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + memset(nulls, 0, sizeof(nulls)); + + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + PGShmemHeader *shmhdr = Segments[i].ShmemSegHdr; + nulls[0] = true; + values[1] = CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(shmhdr->freeoffset); + values[3] = Int64GetDatum(shmhdr->totalsize - shmhdr->freeoffset); + values[4] = values[3]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + } LWLockRelease(ShmemIndexLock); @@ -593,7 +717,7 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) Size os_page_size; void **page_ptrs; int *pages_status; - uint64 shm_total_page_count, + uint64 shm_total_page_count = 0, shm_ent_page_count, max_nodes; Size *nodes; @@ -628,7 +752,12 @@ pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS) * this is not very likely, and moreover we have more entries, each of * them using only fraction of the total pages. 
*/ - shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1; + for(int segment = 0; segment < NUM_MEMORY_MAPPINGS; segment++) + { + PGShmemHeader *shmhdr = Segments[segment].ShmemSegHdr; + shm_total_page_count += (shmhdr->totalsize / os_page_size) + 1; + } + page_ptrs = palloc0(sizeof(void *) * shm_total_page_count); pages_status = palloc(sizeof(int) * shm_total_page_count); @@ -751,7 +880,7 @@ pg_get_shmem_pagesize(void) Assert(huge_pages_status != HUGE_PAGES_UNKNOWN); if (huge_pages_status == HUGE_PAGES_ON) - GetHugePageSize(&os_page_size, NULL); + GetHugePageSize(&os_page_size, NULL, NULL); return os_page_size; } @@ -761,3 +890,45 @@ pg_numa_available(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(pg_numa_init() != -1); } + +/* SQL SRF showing shared memory segments */ +Datum +pg_get_shmem_segments(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SEGS_COLS 6 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + Datum values[PG_GET_SHMEM_SEGS_COLS]; + bool nulls[PG_GET_SHMEM_SEGS_COLS]; + int i; + + InitMaterializedSRF(fcinfo, 0); + + /* output all allocated entries */ + for (i = 0; i < NUM_MEMORY_MAPPINGS; i++) + { + ShmemSegment *segment = &Segments[i]; + PGShmemHeader *shmhdr = segment->ShmemSegHdr; + int j; + + if (shmhdr == NULL) + { + for (j = 0; j < PG_GET_SHMEM_SEGS_COLS; j++) + nulls[j] = true; + } + else + { + memset(nulls, 0, sizeof(nulls)); + values[0] = Int32GetDatum(i); + values[1] = CStringGetTextDatum(MappingName(i)); + values[2] = Int64GetDatum(shmhdr->totalsize); + values[3] = Int64GetDatum(shmhdr->freeoffset); + values[4] = Int64GetDatum(segment->shmem_size); + values[5] = Int64GetDatum(segment->shmem_reserved); + } + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + return (Datum) 0; +} diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index b017880f5e45..c25dd13b63af 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -80,6 +80,8 @@ 
#include "pg_trace.h" #include "pgstat.h" #include "port/pg_bitutils.h" +#include "postmaster/postmaster.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/proclist.h" #include "storage/procnumber.h" @@ -612,12 +614,15 @@ LWLockNewTrancheId(const char *name) /* * We use the ShmemLock spinlock to protect LWLockCounter and * LWLockTrancheNames. + * + * XXX: Looks like this is the only use of Segments outside of shmem.c, + * it's maybe worth it to reshape this part to hide Segments structure. */ - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (*LWLockCounter - LWTRANCHE_FIRST_USER_DEFINED >= MAX_NAMED_TRANCHES) { - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); ereport(ERROR, (errmsg("maximum number of tranches already registered"), errdetail("No more than %d tranches may be registered.", @@ -628,7 +633,7 @@ LWLockNewTrancheId(const char *name) LocalLWLockCounter = *LWLockCounter; strlcpy(LWLockTrancheNames[result - LWTRANCHE_FIRST_USER_DEFINED], name, NAMEDATALEN); - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); return result; } @@ -750,9 +755,9 @@ GetLWTrancheName(uint16 trancheId) */ if (trancheId >= LocalLWLockCounter) { - SpinLockAcquire(ShmemLock); + SpinLockAcquire(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); LocalLWLockCounter = *LWLockCounter; - SpinLockRelease(ShmemLock); + SpinLockRelease(Segments[MAIN_SHMEM_SEGMENT].ShmemLock); if (trancheId >= LocalLWLockCounter) elog(ERROR, "tranche %d is not registered", trancheId); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7dd75a490aab..9c9ebe4280a0 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -63,6 +63,7 @@ #include "rewrite/rewriteHandler.h" #include "storage/bufmgr.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include 
"storage/procsignal.h" @@ -4128,6 +4129,9 @@ PostgresSingleUserMain(int argc, char *argv[], /* Initialize size of fast-path lock cache. */ InitializeFastPathLocks(); + /* Initialize MaxNBuffers for buffer pool resizing. */ + InitializeMaxNBuffers(); + /* * Give preloaded libraries a chance to request additional shared memory. */ @@ -4318,6 +4322,13 @@ PostgresMain(const char *dbname, const char *username) */ BeginReportingGUCOptions(); + /* + * TODO: The new backend should fetch the shared buffers status. If the + * resizing is going on, it should bring itself upto speed with it. If not, + * simply fetch the latest pointers are sizes. Is this the right place to do + * that? + */ + /* * Also set up handler to log session end; we have to wait till now to be * sure Log_disconnections has its final value. diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index c1ac71ff7f24..ee5887496baf 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -162,6 +162,7 @@ WAL_RECEIVER_EXIT "Waiting for the WAL receiver to exit." WAL_RECEIVER_WAIT_START "Waiting for startup process to send initial data for streaming replication." WAL_SUMMARY_READY "Waiting for a new WAL summary to be generated." XACT_GROUP_UPDATE "Waiting for the group leader to update transaction status at transaction end." +PM_BUFFER_RESIZE_WAIT "Waiting for the postmaster to complete shared buffer pool resize operations." ABI_compatibility: @@ -358,6 +359,7 @@ InjectionPoint "Waiting to read or update information related to injection point SerialControl "Waiting to read or update shared pg_serial state." AioWorkerSubmissionQueue "Waiting to access AIO worker submission queue." WaitLSN "Waiting to read or update shared Wait-for-LSN state." +ShmemResize "Waiting to resize shared memory." 
# # END OF PREDEFINED LWLOCKS (DO NOT CHANGE THIS LINE) diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index d31cb45a0588..419c7fad8901 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -139,7 +139,10 @@ int max_parallel_maintenance_workers = 2; * MaxBackends is computed by PostmasterMain after modules have had a chance to * register background workers. */ -int NBuffers = 16384; +int NBuffers = 0; +int NBuffersPending = 16384; +bool finalMaxNBuffers = false; +int MaxNBuffers = 0; int MaxConnections = 100; int max_worker_processes = 8; int max_parallel_workers = 8; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 98f9598cd789..46a8a8a3faad 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -595,6 +595,55 @@ InitializeFastPathLocks(void) pg_nextpower2_32(FastPathLockGroupsPerBackend)); } +/* + * Initialize MaxNBuffers variable with validation. + * + * This must be called after GUCs have been loaded but before shared memory size + * is determined. + * + * Since MaxNBuffers limits the size of the buffer pool, it must be at least as + * much as NBuffersPending. If MaxNBuffers is 0 (default), set it to + * NBuffersPending. Otherwise, validate that MaxNBuffers is not less than + * NBuffersPending. + */ +void +InitializeMaxNBuffers(void) +{ + if (MaxNBuffers == 0) /* default/boot value */ + { + char buf[32]; + + snprintf(buf, sizeof(buf), "%d", NBuffersPending); + SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER, + PGC_S_DYNAMIC_DEFAULT); + + /* + * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT. + * However, if the DBA explicitly set max_shared_buffers = 0 in + * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override + * that and we must force the matter with PGC_S_OVERRIDE. + */ + if (MaxNBuffers == 0) /* failed to apply it? 
*/ + SetConfigOption("max_shared_buffers", buf, PGC_POSTMASTER, + PGC_S_OVERRIDE); + } + else + { + if (MaxNBuffers < NBuffersPending) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("max_shared_buffers (%d) cannot be less than current shared_buffers (%d)", + MaxNBuffers, NBuffersPending), + errhint("Increase max_shared_buffers or decrease shared_buffers."))); + } + } + + Assert(MaxNBuffers > 0); + Assert(!finalMaxNBuffers); + finalMaxNBuffers = true; +} + /* * Early initialization of a backend (either standalone or under postmaster). * This happens even before InitPostgres. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c6484aea087c..96233ba5cb27 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2599,7 +2599,7 @@ convert_to_base_unit(double value, const char *unit, * the value without loss. For example, if the base unit is GUC_UNIT_KB, 1024 * is converted to 1 MB, but 1025 is represented as 1025 kB. */ -static void +void convert_int_from_base_unit(int64 base_value, int base_unit, int64 *value, const char **unit) { diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 1128167c0251..539b29f0065a 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -2013,6 +2013,15 @@ max => 'MAX_BACKENDS /* XXX? 
*/', }, +{ name => "max_shared_buffers", type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', + short_desc => 'Sets the upper limit for the shared_buffers value.', + flags => 'GUC_UNIT_BLOCKS', + variable => 'MaxNBuffers', + boot_val => '0', + min => '0', + max => 'INT_MAX / 2', +}, + { name => 'max_slot_wal_keep_size', type => 'int', context => 'PGC_SIGHUP', group => 'REPLICATION_SENDING', short_desc => 'Sets the maximum WAL size that can be reserved by replication slots.', long_desc => 'Replication slots will be marked as failed, and segments released for deletion or recycling, if this much space is occupied by WAL on disk. -1 means no maximum.', @@ -2581,13 +2590,15 @@ # We sometimes multiply the number of shared buffers by two without # checking for overflow, so we mustn't allow more than INT_MAX / 2. -{ name => 'shared_buffers', type => 'int', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM', +{ name => 'shared_buffers', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_MEM', short_desc => 'Sets the number of shared memory buffers used by the server.', flags => 'GUC_UNIT_BLOCKS', - variable => 'NBuffers', + variable => 'NBuffersPending', boot_val => '16384', min => '16', max => 'INT_MAX / 2', + check_hook => 'check_shared_buffers', + show_hook => 'show_shared_buffers', }, { name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 1edb18958f75..d0c9e6ec7577 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8576,8 +8576,8 @@ { oid => '5052', descr => 'allocations from the main shared memory segment', proname => 'pg_get_shmem_allocations', prorows => '50', proretset => 't', provolatile => 'v', prorettype => 'record', proargtypes => '', - proallargtypes => '{text,int8,int8,int8}', proargmodes => '{o,o,o,o}', - proargnames => '{name,off,size,allocated_size}', + 
proallargtypes => '{text,text,int8,int8,int8}', proargmodes => '{o,o,o,o,o}', + proargnames => '{name,segment,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, { oid => '4099', descr => 'Is NUMA support available?', @@ -8600,6 +8600,14 @@ proargmodes => '{o,o,o}', proargnames => '{name,type,size}', prosrc => 'pg_get_dsm_registry_allocations' }, +# shared memory segments +{ oid => '5101', descr => 'shared memory segments', + proname => 'pg_get_shmem_segments', prorows => '6', proretset => 't', + provolatile => 'v', prorettype => 'record', proargtypes => '', + proallargtypes => '{int4,text,int8,int8,int8,int8}', proargmodes => '{o,o,o,o,o,o}', + proargnames => '{id,name,size,freeoffset,mapping_size,mapping_reserved_size}', + prosrc => 'pg_get_shmem_segments' }, + # memory context of local backend { oid => '2282', descr => 'information about all memory contexts of local backend', @@ -12612,4 +12620,10 @@ proargnames => '{pid,io_id,io_generation,state,operation,off,length,target,handle_data_len,raw_result,result,target_desc,f_sync,f_localmem,f_buffered}', prosrc => 'pg_get_aios' }, +{ oid => '9999', descr => 'resize shared buffers according to the value of GUC `shared_buffers`', + proname => 'pg_resize_shared_buffers', + provolatile => 'v', + prorettype => 'bool', + proargtypes => '', + prosrc => 'pg_resize_shared_buffers'}, ] diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 9a7d733ddeff..b4dc2c4ba57d 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -173,7 +173,11 @@ extern PGDLLIMPORT bool ExitOnAnyError; extern PGDLLIMPORT char *DataDir; extern PGDLLIMPORT int data_directory_mode; +/* TODO: This is no more a GUC variable; should be moved somewhere else. 
*/ extern PGDLLIMPORT int NBuffers; +extern PGDLLIMPORT int NBuffersPending; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; extern PGDLLIMPORT int MaxBackends; extern PGDLLIMPORT int MaxConnections; extern PGDLLIMPORT int max_worker_processes; @@ -502,6 +506,7 @@ extern PGDLLIMPORT ProcessingMode Mode; extern void pg_split_opts(char **argv, int *argcp, const char *optstr); extern void InitializeMaxBackends(void); extern void InitializeFastPathLocks(void); +extern void InitializeMaxNBuffers(void); extern void InitPostgres(const char *in_dbname, Oid dboid, const char *username, Oid useroid, bits32 flags, diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h index ef9800732d90..40588ff69683 100644 --- a/src/include/portability/mem.h +++ b/src/include/portability/mem.h @@ -38,7 +38,7 @@ #define MAP_NOSYNC 0 #endif -#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) +#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE) /* Some really old systems don't define MAP_FAILED. */ #ifndef MAP_FAILED diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 5400c56a965f..4c53194e13e4 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -28,6 +28,7 @@ #include "storage/spin.h" #include "utils/relcache.h" #include "utils/resowner.h" +#include "utils/tuplestore.h" /* * Buffer state is a single 32-bit variable where following data is combined. 
@@ -512,6 +513,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern void StrategyReset(int activeNBuffers); /* buf_table.c */ extern Size BufTableShmemSize(int size); @@ -520,6 +522,7 @@ extern uint32 BufTableHashCode(BufferTag *tagPtr); extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); +extern void BufTableGetContents(Tuplestorestate *tupstore, TupleDesc tupdesc); /* localbuf.c */ extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index b5f8f3c5d42f..774cf8f38edd 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -19,7 +19,9 @@ #include "storage/block.h" #include "storage/buf.h" #include "storage/bufpage.h" +#include "storage/pg_shmem.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" #include "utils/relcache.h" #include "utils/snapmgr.h" @@ -158,6 +160,7 @@ typedef struct WritebackContext WritebackContext; /* in globals.c ... 
this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; +extern PGDLLIMPORT int NBuffersPending; /* in bufmgr.c */ extern PGDLLIMPORT bool zero_damaged_pages; @@ -204,6 +207,11 @@ extern PGDLLIMPORT int32 *LocalRefCount; #define BUFFER_LOCK_SHARE 1 #define BUFFER_LOCK_EXCLUSIVE 2 +/* + * prototypes for functions in buf_init.c + */ +extern const char *show_shared_buffers(void); +extern bool check_shared_buffers(int *newval, void **extra, GucSource source); /* * prototypes for functions in bufmgr.c @@ -307,6 +315,7 @@ extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); extern bool BgBufferSync(WritebackContext *wb_context); +extern void BgBufferSyncReset(int currentNBuffers, int targetNBuffers); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); @@ -323,10 +332,13 @@ extern void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped); +extern bool EvictExtraBuffers(int targetNBuffers, int currentNBuffers); /* in buf_init.c */ extern void BufferManagerShmemInit(void); -extern Size BufferManagerShmemSize(void); +extern Size BufferManagerShmemSize(MemoryMappingSizes *mapping_sizes); +extern void BufferManagerShmemResize(int currentNBuffers, int targetNBuffers); +extern void BufferManagerShmemValidate(int targetNBuffers); /* in localbuf.c */ extern void AtProcExit_LocalBuffers(void); @@ -375,7 +387,7 @@ extern void FreeAccessStrategy(BufferAccessStrategy strategy); static inline bool BufferIsValid(Buffer bufnum) { - Assert(bufnum <= NBuffers); + Assert(bufnum <= (Buffer) pg_atomic_read_u32(&ShmemCtrl->currentNBuffers)); Assert(bufnum >= -NLocBuffer); return bufnum != InvalidBuffer; @@ -429,4 +441,11 @@ BufferGetPage(Buffer buffer) #endif /* FRONTEND */ +/* buf_resize.c */ +extern Datum pg_resize_shared_buffers(PG_FUNCTION_ARGS); +extern bool ProcessBarrierShmemShrink(void); +extern bool ProcessBarrierShmemResizeMapAndMem(void); +extern bool 
ProcessBarrierShmemExpand(void); +extern bool ProcessBarrierShmemResizeFailed(void); + #endif /* BUFMGR_H */ diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index 2a8a8f0eabdb..6dbbb9ad064a 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -18,6 +18,8 @@ #ifndef IPC_H #define IPC_H +#include "storage/pg_shmem.h" + typedef void (*pg_on_exit_callback) (int code, Datum arg); typedef void (*shmem_startup_hook_type) (void); @@ -64,6 +66,7 @@ typedef void (*shmem_startup_hook_type) (void); /* ipc.c */ extern PGDLLIMPORT bool proc_exit_inprogress; extern PGDLLIMPORT bool shmem_exit_inprogress; +extern PGDLLIMPORT volatile bool delay_shmem_resize; pg_noreturn extern void proc_exit(int code); extern void shmem_exit(int code); @@ -77,11 +80,13 @@ extern void check_on_shmem_exit_lists_are_empty(void); /* ipci.c */ extern PGDLLIMPORT shmem_startup_hook_type shmem_startup_hook; -extern Size CalculateShmemSize(void); +extern Size CalculateShmemSize(MemoryMappingSizes *mapping_sizes); extern void CreateSharedMemoryAndSemaphores(void); #ifdef EXEC_BACKEND extern void AttachSharedMemoryStructs(void); #endif extern void InitializeShmemGUCs(void); +extern void CoordinateShmemResize(void); +extern bool AnonymousShmemResize(void); #endif /* IPC_H */ diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index 5b0ce383408c..9c4b928441ce 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -86,6 +86,7 @@ PG_LWLOCK(51, InjectionPoint) PG_LWLOCK(52, SerialControl) PG_LWLOCK(53, AioWorkerSubmissionQueue) PG_LWLOCK(54, WaitLSN) +PG_LWLOCK(55, ShmemResize) /* * There also exist several built-in LWLock tranches. 
As with the predefined diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 5f7d4b83a60e..369000688209 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -24,7 +24,19 @@ #ifndef PG_SHMEM_H #define PG_SHMEM_H +#include "port/atomics.h" +#include "storage/barrier.h" #include "storage/dsm_impl.h" +#include "storage/procsignal.h" +#include "storage/spin.h" +#include "storage/shmem.h" +#include "utils/guc.h" + +typedef struct MemoryMappingSizes +{ + Size shmem_req_size; /* Required size of the segment */ + Size shmem_reserved; /* Required size of the reserved address space. */ +} MemoryMappingSizes; typedef struct PGShmemHeader /* standard header for all Postgres shmem */ { @@ -41,11 +53,56 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ #endif } PGShmemHeader; +typedef struct ShmemSegment +{ + PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ + void *ShmemBase; /* start address of shared memory */ + void *ShmemEnd; /* end+1 address of shared memory */ + slock_t *ShmemLock; /* spinlock for shared memory and LWLock + * allocation */ + int segment_fd; /* fd for the backing anon file */ + unsigned long seg_id; /* IPC key */ + int shmem_segment; /* TODO: Do we really need it? */ + Size shmem_size; /* Size of the actually used memory */ + Size shmem_reserved; /* Size of the reserved mapping */ + Pointer shmem; /* Pointer to the start of the mapped memory */ + Pointer seg_addr; /* SysV shared memory for the header */ +} ShmemSegment; + +/* Number of available segments for anonymous memory mappings */ +#define NUM_MEMORY_MAPPINGS 5 + +extern PGDLLIMPORT ShmemSegment Segments[NUM_MEMORY_MAPPINGS]; + +/* + * ShmemControl is shared between backends and helps to coordinate shared + * memory resize. + * + * TODO: I think we need a lock to protect this structure. If we do so, do we + * need to use atomic integers? 
+ */ +typedef struct +{ + pg_atomic_flag resize_in_progress; /* true if resizing is in progress, false otherwise. */ + pg_atomic_uint32 currentNBuffers; /* Original NBuffers value before resize started */ + pg_atomic_uint32 targetNBuffers; + pid_t coordinator; +} ShmemControl; + +extern PGDLLIMPORT ShmemControl *ShmemCtrl; + +/* The phases for shared memory resizing, used by the ProcSignal barrier. */ +#define SHMEM_RESIZE_REQUESTED 0 +#define SHMEM_RESIZE_START 1 +#define SHMEM_RESIZE_DONE 2 + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; extern PGDLLIMPORT int huge_pages_status; +extern PGDLLIMPORT bool finalMaxNBuffers; +extern PGDLLIMPORT int MaxNBuffers; /* Possible values for huge_pages and huge_pages_status */ typedef enum @@ -85,10 +142,53 @@ extern void PGSharedMemoryReAttach(void); extern void PGSharedMemoryNoReAttach(void); #endif -extern PGShmemHeader *PGSharedMemoryCreate(Size size, +/* + * round up the mapping sizes to a multiple of a typical page size.
+ */ +static inline void +round_off_mapping_sizes(MemoryMappingSizes *mapping_sizes) +{ + mapping_sizes->shmem_req_size = add_size(mapping_sizes->shmem_req_size, 8192 - (mapping_sizes->shmem_req_size % 8192)); + mapping_sizes->shmem_reserved = add_size(mapping_sizes->shmem_reserved, 8192 - (mapping_sizes->shmem_reserved % 8192)); +} + + +extern PGShmemHeader *PGSharedMemoryCreate(MemoryMappingSizes *mapping_sizes, int segment_id, PGShmemHeader **shim); extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); extern void PGSharedMemoryDetach(void); -extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags); +extern const char *MappingName(int shmem_segment); +extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, + int *memfd_flags); +void PrepareHugePages(void); + +bool ProcessBarrierShmemResize(Barrier *barrier); +const char *show_shared_buffers(void); +bool check_shared_buffers(int *newval, void **extra, GucSource source); +void AdjustShmemSize(void); +extern void WaitOnShmemBarrier(void); +extern void ShmemControlInit(void); + +/* + * To be able to dynamically resize the largest parts of the data stored in shared + * memory, we split it into multiple shared memory mapping segments. Each + * segment contains only a certain part of the data, whose size depends on + * NBuffers. + */ + +/* The main segment contains everything except buffer blocks and related data.
*/ +#define MAIN_SHMEM_SEGMENT 0 + +/* Buffer blocks */ +#define BUFFERS_SHMEM_SEGMENT 1 + +/* Buffer descriptors */ +#define BUFFER_DESCRIPTORS_SHMEM_SEGMENT 2 + +/* Condition variables for buffers */ +#define BUFFER_IOCV_SHMEM_SEGMENT 3 + +/* Checkpoint BufferIds */ +#define CHECKPOINT_BUFFERS_SHMEM_SEGMENT 4 #endif /* PG_SHMEM_H */ diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h index 428aa3fd68a0..5ced2a835370 100644 --- a/src/include/storage/pmsignal.h +++ b/src/include/storage/pmsignal.h @@ -42,9 +42,10 @@ typedef enum PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */ PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */ PMSIGNAL_XLOG_IS_SHUTDOWN, /* ShutdownXLOG() completed */ + PMSIGNAL_SHMEM_RESIZE, /* resize shared memory */ } PMSignalReason; -#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1) +#define NUM_PMSIGNALS (PMSIGNAL_SHMEM_RESIZE+1) /* * Reasons why the postmaster would send SIGQUIT to its children. diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index afeeb1ca019f..4de11faf12d4 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -54,6 +54,10 @@ typedef enum typedef enum { PROCSIGNAL_BARRIER_SMGRRELEASE, /* ask smgr to close files */ + PROCSIGNAL_BARRIER_SHBUF_SHRINK, /* shrink buffer pool - restrict allocations to new size */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_MAP_AND_MEM, /* remap shared memory segments and update structure pointers */ + PROCSIGNAL_BARRIER_SHBUF_EXPAND, /* expand buffer pool - enable allocations in new range */ + PROCSIGNAL_BARRIER_SHBUF_RESIZE_FAILED, /* signal backends that the shared buffer resizing failed. 
*/ } ProcSignalBarrierType; /* diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 70a5b8b172c6..d59e5ba6dcd6 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -30,19 +30,33 @@ extern PGDLLIMPORT slock_t *ShmemLock; typedef struct PGShmemHeader PGShmemHeader; /* avoid including * storage/pg_shmem.h here */ extern void InitShmemAccess(PGShmemHeader *seghdr); +extern void InitShmemAccessInSegment(struct PGShmemHeader *seghdr, + int shmem_segment); extern void InitShmemAllocation(void); +extern void InitShmemAllocationInSegment(int shmem_segment); extern void *ShmemAlloc(Size size); +extern void *ShmemAllocInSegment(Size size, int shmem_segment); extern void *ShmemAllocNoError(Size size); +extern void *ShmemAllocUnlockedInSegment(Size size, int shmem_segment); extern bool ShmemAddrIsValid(const void *addr); +extern bool ShmemAddrIsValidInSegment(const void *addr, int shmem_segment); extern void InitShmemIndex(void); extern HTAB *ShmemInitHash(const char *name, int64 init_size, int64 max_size, HASHCTL *infoP, int hash_flags); +extern HTAB *ShmemInitHashInSegment(const char *name, long init_size, + long max_size, HASHCTL *infoP, + int hash_flags, int shmem_segment); extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); +extern void *ShmemInitStructInSegment(const char *name, Size size, + bool *foundPtr, int shmem_segment); +extern void *ShmemUpdateStructInSegment(const char *name, Size size, + bool *foundPtr, int shmem_segment); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); extern PGDLLIMPORT Size pg_get_shmem_pagesize(void); + /* ipci.c */ extern void RequestAddinShmemSpace(Size size); @@ -59,6 +73,7 @@ typedef struct void *location; /* location in shared mem */ Size size; /* # bytes requested for the structure */ Size allocated_size; /* # bytes actually allocated */ + int shmem_segment; /* segment in which the structure is allocated */ } ShmemIndexEnt; #endif /* 
SHMEM_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index f21ec37da893..08a84373fb70 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -459,6 +459,8 @@ extern config_handle *get_config_handle(const char *name); extern void AlterSystemSetConfigFile(AlterSystemStmt *altersysstmt); extern char *GetConfigOptionByName(const char *name, const char **varname, bool missing_ok); +extern void convert_int_from_base_unit(int64 base_value, int base_unit, + int64 *value, const char **unit); extern void TransformGUCArray(ArrayType *array, List **names, List **values); diff --git a/src/test/Makefile b/src/test/Makefile index 511a72e6238a..95f8858a8183 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -12,7 +12,7 @@ subdir = src/test top_builddir = ../.. include $(top_builddir)/src/Makefile.global -SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription +SUBDIRS = perl postmaster regress isolation modules authentication recovery subscription buffermgr ifeq ($(with_icu),yes) SUBDIRS += icu diff --git a/src/test/README b/src/test/README index afdc76765190..77f11607ff76 100644 --- a/src/test/README +++ b/src/test/README @@ -15,6 +15,9 @@ examples/ Demonstration programs for libpq that double as regression tests via "make check" +buffermgr/ + Tests for resizing buffer pool without restarting the server + isolation/ Tests for concurrent behavior at the SQL level diff --git a/src/test/buffermgr/Makefile b/src/test/buffermgr/Makefile new file mode 100644 index 000000000000..eb275027fa60 --- /dev/null +++ b/src/test/buffermgr/Makefile @@ -0,0 +1,30 @@ +#------------------------------------------------------------------------- +# +# Makefile for src/test/buffermgr +# +# Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group +# Portions Copyright (c) 1994, Regents of the University of California +# +# src/test/buffermgr/Makefile +# 
+#------------------------------------------------------------------------- + +EXTRA_INSTALL = contrib/pg_buffercache + +REGRESS = buffer_resize + +# Custom configuration for buffer manager tests +TEMP_CONFIG = $(srcdir)/buffermgr_test.conf + +subdir = src/test/buffermgr +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +check: + $(prove_check) + +installcheck: + $(prove_installcheck) + +clean distclean: + rm -rf tmp_check diff --git a/src/test/buffermgr/README b/src/test/buffermgr/README new file mode 100644 index 000000000000..c375ad809892 --- /dev/null +++ b/src/test/buffermgr/README @@ -0,0 +1,26 @@ +src/test/buffermgr/README + +Regression tests for buffer manager +=================================== + +This directory contains a test suite for resizing the buffer manager without restarting the server. + + +Running the tests +================= + +NOTE: You must have given the --enable-tap-tests argument to configure. + +Run + make check +or + make installcheck +You can use "make installcheck" if you previously did "make install". +In that case, the code in the installation tree is tested. With +"make check", a temporary installation tree is built from the current +sources and then tested. + +Either way, this test initializes, starts, and stops a test Postgres +cluster. + +See src/test/perl/README for more info about running these tests. diff --git a/src/test/buffermgr/buffermgr_test.conf b/src/test/buffermgr/buffermgr_test.conf new file mode 100644 index 000000000000..b7c0065c80b8 --- /dev/null +++ b/src/test/buffermgr/buffermgr_test.conf @@ -0,0 +1,11 @@ +# Configuration for buffer manager regression tests + +# Even if max_shared_buffers is set multiple times only the last one is used +# as the limit on shared_buffers.
+max_shared_buffers = 128kB +# Set initial shared_buffers as expected by test +shared_buffers = 128MB +# Set a larger value for max_shared_buffers to allow testing resize operations +max_shared_buffers = 300MB +# Turn huge pages off, since that affects the size of memory segments +huge_pages = off \ No newline at end of file diff --git a/src/test/buffermgr/expected/buffer_resize.out b/src/test/buffermgr/expected/buffer_resize.out new file mode 100644 index 000000000000..d5cb9d784372 --- /dev/null +++ b/src/test/buffermgr/expected/buffer_resize.out @@ -0,0 +1,329 @@ +-- Test buffer pool resizing and shared memory allocation tracking +-- This test resizes the buffer pool multiple times and monitors +-- shared memory allocations related to buffer management +-- TODO: The test sets shared_buffers values in MBs. Instead it could use values +-- in kBs so that the test runs on very small machines. +-- Create a view for buffer-related shared memory allocations +CREATE VIEW buffer_allocations AS +SELECT name, segment, size, allocated_size +FROM pg_shmem_allocations +WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables', + 'Checkpoint BufferIds') +ORDER BY name; +-- Note: We exclude the 'main' segment even if it contains the shared buffer +-- lookup table because it contains other shared structures whose total sizes +-- may vary as the code changes. 
+CREATE VIEW buffer_segments AS +SELECT name, size, mapping_size, mapping_reserved_size +FROM pg_shmem_segments +WHERE name <> 'main' +ORDER BY name; +-- Enable pg_buffercache for buffer count verification +CREATE EXTENSION IF NOT EXISTS pg_buffercache; +-- Test 1: Default shared_buffers +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. 
+SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 134221824 | 134221824 + Buffer Descriptors | descriptors | 1048576 | 1048576 + Buffer IO Condition Variables | iocv | 262144 | 262144 + Checkpoint BufferIds | checkpoint | 327680 | 327680 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 134225920 | 134225920 | 314580992 + checkpoint | 335872 | 335872 | 770048 + descriptors | 1056768 | 1056768 | 2465792 + iocv | 270336 | 270336 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16384 +(1 row) + +-- Test 2: Set to 64MB +ALTER SYSTEM SET shared_buffers = '64MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 128MB (pending: 64MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 64MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+----------+---------------- + Buffer Blocks | buffers | 67112960 | 67112960 + Buffer Descriptors | descriptors | 524288 | 524288 + Buffer IO Condition Variables | iocv | 131072 | 131072 + Checkpoint BufferIds | checkpoint | 163840 | 163840 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+----------+--------------+----------------------- + 
buffers | 67117056 | 67117056 | 314580992 + checkpoint | 172032 | 172032 | 770048 + descriptors | 532480 | 532480 | 2465792 + iocv | 139264 | 139264 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 8192 +(1 row) + +-- Test 3: Set to 256MB +ALTER SYSTEM SET shared_buffers = '256MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +----------------------- + 64MB (pending: 256MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 256MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 268439552 | 268439552 + Buffer Descriptors | descriptors | 2097152 | 2097152 + Buffer IO Condition Variables | iocv | 524288 | 524288 + Checkpoint BufferIds | checkpoint | 655360 | 655360 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 268443648 | 268443648 | 314580992 + checkpoint | 663552 | 663552 | 770048 + descriptors | 2105344 | 2105344 | 2465792 + iocv | 532480 | 532480 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 32768 +(1 row) + +-- Test 4: Set to 100MB (non-power-of-two) +ALTER SYSTEM SET shared_buffers = '100MB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +------------------------ + 256MB (pending: 100MB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW 
shared_buffers; + shared_buffers +---------------- + 100MB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+-----------+---------------- + Buffer Blocks | buffers | 104861696 | 104861696 + Buffer Descriptors | descriptors | 819200 | 819200 + Buffer IO Condition Variables | iocv | 204800 | 204800 + Checkpoint BufferIds | checkpoint | 256000 | 256000 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+-----------+--------------+----------------------- + buffers | 104865792 | 104865792 | 314580992 + checkpoint | 262144 | 262144 | 770048 + descriptors | 827392 | 827392 | 2465792 + iocv | 212992 | 212992 | 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 12800 +(1 row) + +-- Test 5: Set to minimum 128kB +ALTER SYSTEM SET shared_buffers = '128kB'; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; + shared_buffers +------------------------ + 100MB (pending: 128kB) +(1 row) + +SELECT pg_resize_shared_buffers(); + pg_resize_shared_buffers +-------------------------- + t +(1 row) + +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SELECT * FROM buffer_allocations; + name | segment | size | allocated_size +-------------------------------+-------------+--------+---------------- + Buffer Blocks | buffers | 135168 | 135168 + Buffer Descriptors | descriptors | 1024 | 1024 + Buffer IO Condition Variables | iocv | 256 | 256 + Checkpoint BufferIds | checkpoint | 320 | 320 +(4 rows) + +SELECT * FROM buffer_segments; + name | size | mapping_size | mapping_reserved_size +-------------+--------+--------------+----------------------- + buffers | 139264 | 139264 | 314580992 + checkpoint | 8192 | 8192 | 770048 + descriptors | 8192 | 8192 | 2465792 + iocv | 8192 | 8192 
| 622592 +(4 rows) + +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + buffer_count +-------------- + 16 +(1 row) + +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +ERROR: invalid value for parameter "shared_buffers": 51200 +DETAIL: "shared_buffers" must be less than "max_shared_buffers". +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; + shared_buffers +---------------- + 128kB +(1 row) + +SHOW max_shared_buffers; + max_shared_buffers +-------------------- + 300MB +(1 row) + diff --git a/src/test/buffermgr/meson.build b/src/test/buffermgr/meson.build new file mode 100644 index 000000000000..f33feb64a069 --- /dev/null +++ b/src/test/buffermgr/meson.build @@ -0,0 +1,24 @@ +# Copyright (c) 2022-2025, PostgreSQL Global Development Group + +tests += { + 'name': 'buffermgr', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'buffer_resize', + ], + 'regress_args': ['--temp-config', files('buffermgr_test.conf')], + }, + 'tap': { + 'env': { + 'enable_injection_points': get_option('injection_points') ? 'yes' : 'no', + }, + 'tests': [ + 't/001_resize_buffer.pl', + 't/002_checkpoint_buffer_resize.pl', + 't/003_parallel_resize_buffer.pl', + 't/004_client_join_buffer_resize.pl', + ], + }, +} diff --git a/src/test/buffermgr/sql/buffer_resize.sql b/src/test/buffermgr/sql/buffer_resize.sql new file mode 100644 index 000000000000..dfaaeabfcbbb --- /dev/null +++ b/src/test/buffermgr/sql/buffer_resize.sql @@ -0,0 +1,95 @@ +-- Test buffer pool resizing and shared memory allocation tracking +-- This test resizes the buffer pool multiple times and monitors +-- shared memory allocations related to buffer management +-- TODO: The test sets shared_buffers values in MBs. 
Instead it could use values +-- in kBs so that the test runs on very small machines. + +-- Create a view for buffer-related shared memory allocations +CREATE VIEW buffer_allocations AS +SELECT name, segment, size, allocated_size +FROM pg_shmem_allocations +WHERE name IN ('Buffer Blocks', 'Buffer Descriptors', 'Buffer IO Condition Variables', + 'Checkpoint BufferIds') +ORDER BY name; + +-- Note: We exclude the 'main' segment even if it contains the shared buffer +-- lookup table because it contains other shared structures whose total sizes +-- may vary as the code changes. +CREATE VIEW buffer_segments AS +SELECT name, size, mapping_size, mapping_reserved_size +FROM pg_shmem_segments +WHERE name <> 'main' +ORDER BY name; + +-- Enable pg_buffercache for buffer count verification +CREATE EXTENSION IF NOT EXISTS pg_buffercache; + +-- Test 1: Default shared_buffers +SHOW shared_buffers; +SHOW max_shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; +-- Calling pg_resize_shared_buffers() without changing shared_buffers should be a no-op. 
+SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 2: Set to 64MB +ALTER SYSTEM SET shared_buffers = '64MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 3: Set to 256MB +ALTER SYSTEM SET shared_buffers = '256MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 4: Set to 100MB (non-power-of-two) +ALTER SYSTEM SET shared_buffers = '100MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 5: Set to minimum 128kB +ALTER SYSTEM SET shared_buffers = '128kB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +SHOW shared_buffers; +SELECT pg_resize_shared_buffers(); +SHOW shared_buffers; +SELECT * FROM buffer_allocations; +SELECT * FROM buffer_segments; +SELECT COUNT(*) AS buffer_count FROM pg_buffercache; + +-- Test 6: Try to set shared_buffers higher than max_shared_buffers (should fail) +ALTER SYSTEM SET shared_buffers = '400MB'; +SELECT pg_reload_conf(); +-- reconnect to ensure new setting is loaded +\c +-- This should show the old value since the configuration was rejected +SHOW shared_buffers; +SHOW max_shared_buffers; diff --git a/src/test/buffermgr/t/001_resize_buffer.pl 
b/src/test/buffermgr/t/001_resize_buffer.pl new file mode 100644 index 000000000000..a0d7f0941713 --- /dev/null +++ b/src/test/buffermgr/t/001_resize_buffer.pl @@ -0,0 +1,135 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Minimal test testing shared_buffer resizing under load + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Function to resize buffer pool and verify the change. +sub apply_and_verify_buffer_change +{ + my ($node, $new_size) = @_; + + # Use the new pg_resize_shared_buffers() interface which handles everything synchronously + $node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '$new_size'"); + $node->safe_psql('postgres', "SELECT pg_reload_conf()"); + + # If resize function fails, try a few times before giving up + my $max_retries = 5; + my $retry_delay = 1; # seconds + my $success = 0; + for my $attempt (1..$max_retries) { + my $result = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()"); + if ($result eq 't') { + $success = 1; + last; + } + + # If not the last attempt, wait before retrying + if ($attempt < $max_retries) { + note "Resizing buffer pool to $new_size, attempt $attempt failed, retrying after $retry_delay seconds..."; + sleep($retry_delay); + } + } + + is($success, 1, 'resizing to ' . $new_size . ' succeeded after retries'); + is($node->safe_psql('postgres', "SHOW shared_buffers"), $new_size, + 'SHOW after resizing to '. $new_size . ' succeeded'); +} + +# Initialize a cluster and start pgbench in the background for concurrent load. +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; + +# Permit resizing up to 1GB for this test and let the server start with 128MB. 
+$node->append_conf('postgresql.conf', qq{ +max_shared_buffers = 1GB +shared_buffers = 128MB +log_statement = none +}); + +$node->start; +$node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache"); +my $pgb_scale = 10; +my $pgb_duration = 120; +my $pgb_num_clients = 10; +$node->pgbench( + "--initialize --init-steps=dtpvg --scale=$pgb_scale --quiet", + 0, + [qr{^$}], + [ # stderr patterns to verify initialization stages + qr{dropping old tables}, + qr{creating tables}, + qr{done in \d+\.\d\d s } + ], + "pgbench initialization (scale=$pgb_scale)" +); +my ($pgbench_stdin, $pgbench_stdout, $pgbench_stderr) = ('', '', ''); +# Use --exit-on-abort so that the test stops on the first server crash or error, +# thus making it easy to debug the failure. Use -C to increase the chances of a +# new backend being created while resizing the buffer pool. +my $pgbench_process = IPC::Run::start( + [ + 'pgbench', + '-p', $node->port, + '-T', $pgb_duration, + '-c', $pgb_num_clients, + '-C', + '--exit-on-abort', + 'postgres' + ], + '<' => \$pgbench_stdin, + '>' => \$pgbench_stdout, + '2>' => \$pgbench_stderr +); + +ok($pgbench_process, "pgbench started successfully"); + +# Allow pgbench to establish connections and start generating load. +# +# TODO: When creating new backends is known to work well with buffer pool +# resizing, this wait should be removed. +sleep(1); + +# Resize buffer pool to various sizes while pgbench is running in the +# background. +# +# TODO: These are pseudo-randomly picked sizes, but we can do better. +my $tests_completed = 0; +my @buffer_sizes = ('900MB', '500MB', '250MB', '400MB', '120MB', '600MB'); +for my $target_size (@buffer_sizes) +{ + # Verify workload generator is still running + if (!$pgbench_process->pumpable) { + ok(0, "pgbench is still running"); + last; + } + + apply_and_verify_buffer_change($node, $target_size); + $tests_completed++; + + # Wait for the resized buffer pool to stabilize. 
If the resized buffer pool + # is utilized fully, it might hit any wrongly initialized areas of shared + # memory. + sleep(2); +} +is($tests_completed, scalar(@buffer_sizes), "All buffer sizes were tested"); + +# Make sure that pgbench can end normally. +$pgbench_process->signal('TERM'); +IPC::Run::finish $pgbench_process; +ok(grep { $pgbench_process->result == $_ } (0, 15), "pgbench exited gracefully"); + +# Log any error output from pgbench for debugging +diag("pgbench stderr:\n$pgbench_stderr"); +diag("pgbench stdout:\n$pgbench_stdout"); + +# Ensure database is still functional after all the buffer changes +$node->connect_ok("dbname=postgres", + "Database remains accessible after $tests_completed buffer resize operations"); + +done_testing(); \ No newline at end of file diff --git a/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl new file mode 100644 index 000000000000..9ab615b6557f --- /dev/null +++ b/src/test/buffermgr/t/002_checkpoint_buffer_resize.pl @@ -0,0 +1,111 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test shared_buffer resizing coordination with checkpoint using injection points + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Initialize cluster with injection points enabled +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', 'shared_buffers = 256kB'); +# Disable background writer to prevent interference with dirty buffers +$node->append_conf('postgresql.conf', 'bgwriter_lru_maxpages = 0'); +$node->start; + +# Load the injection points extension +$node->safe_psql('postgres', 
"CREATE EXTENSION injection_points"); + +# Create some data to make checkpoint meaningful and ensure many dirty buffers +$node->safe_psql('postgres', "CREATE TABLE test_data (id int, data text)"); +# Insert enough data to fill more than 16 buffers (each row ~1KB, so 20+ rows per page) +$node->safe_psql('postgres', "INSERT INTO test_data SELECT i, repeat('x', 1000) FROM generate_series(1, 5000) i"); + +# Create additional tables to ensure we have plenty of dirty buffers +$node->safe_psql('postgres', "CREATE TABLE test_data2 AS SELECT * FROM test_data WHERE id <= 2500"); +$node->safe_psql('postgres', "CREATE TABLE test_data3 AS SELECT * FROM test_data WHERE id > 2500"); + +# Update data to create more dirty buffers +$node->safe_psql('postgres', "UPDATE test_data SET data = repeat('y', 1000) WHERE id % 3 = 0"); +$node->safe_psql('postgres', "UPDATE test_data2 SET data = repeat('z', 1000) WHERE id % 2 = 0"); + +# Prepare the new shared_buffers configuration before starting checkpoint +$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '128kB'"); +$node->safe_psql('postgres', "SELECT pg_reload_conf()"); + +# Set up the injection point to make checkpoint wait +$node->safe_psql('postgres', "SELECT injection_points_attach('buffer-sync-dirty-buffer-scan', 'wait')"); + +# Start a checkpoint in the background that will trigger the injection point +my $checkpoint_session = $node->background_psql('postgres'); +$checkpoint_session->query_until( + qr/starting_checkpoint/, + q( + \echo starting_checkpoint + CHECKPOINT; + \q + ) +); + +# Wait until checkpointer actually reaches the injection point +$node->wait_for_event('checkpointer', 'buffer-sync-dirty-buffer-scan'); + +# Verify checkpoint is waiting by checking if it hasn't completed +my $checkpoint_running = $node->safe_psql('postgres', + "SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'checkpointer' AND wait_event = 'buffer-sync-dirty-buffer-scan'"); +is($checkpoint_running, '1', 'Checkpoint is waiting 
at injection point'); + +# Start the resize operation in the background (don't wait for completion) +my $resize_session = $node->background_psql('postgres'); +$resize_session->query_until( + qr/starting_resize/, + q( + \echo starting_resize + SELECT pg_resize_shared_buffers(); + ) +); + +# Continue the checkpoint and wait for its completion +my $log_offset = -s $node->logfile; +$node->safe_psql('postgres', "SELECT injection_points_wakeup('buffer-sync-dirty-buffer-scan')"); + +# Wait for both checkpoint and resize to complete +$node->wait_for_log(qr/checkpoint complete/, $log_offset); + +# Wait for the resize operation to complete using the proper method +$resize_session->query(q(\echo 'resize_complete')); + +pass('Checkpoint and buffer resize both completed after injection point was released'); + +# Verify the resize actually worked +is($node->safe_psql('postgres', "SHOW shared_buffers"), '128kB', + 'Buffer resize completed successfully after checkpoint coordination'); + +# Cleanup the background session +$resize_session->quit; + +# Clean up the injection point +$node->safe_psql('postgres', "SELECT injection_points_detach('buffer-sync-dirty-buffer-scan')"); + +# Verify system remains stable after coordinated operations + +# Perform a normal checkpoint to ensure everything is working +$node->safe_psql('postgres', "CHECKPOINT"); + +pass('System remains stable after injection point testing'); + +# Cleanup +$node->safe_psql('postgres', "DROP TABLE test_data, test_data2, test_data3"); + +done_testing(); \ No newline at end of file diff --git a/src/test/buffermgr/t/003_parallel_resize_buffer.pl b/src/test/buffermgr/t/003_parallel_resize_buffer.pl new file mode 100644 index 000000000000..9cbb5452fd27 --- /dev/null +++ b/src/test/buffermgr/t/003_parallel_resize_buffer.pl @@ -0,0 +1,71 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test that only one pg_resize_shared_buffers() call succeeds when multiple +# sessions attempt to resize buffers 
concurrently + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Initialize a cluster +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', 'shared_buffers = 128kB'); +$node->append_conf('postgresql.conf', 'max_shared_buffers = 256kB'); +$node->start; + +# Load injection points extension for test coordination +$node->safe_psql('postgres', "CREATE EXTENSION injection_points"); + +# Test 1: Two concurrent pg_resize_shared_buffers() calls +# Set up injection point to pause the first resize call +$node->safe_psql('postgres', + "SELECT injection_points_attach('pg-resize-shared-buffers-flag-set', 'wait')"); + +# Change shared_buffers for the resize operation +$node->safe_psql('postgres', "ALTER SYSTEM SET shared_buffers = '144kB'"); +$node->safe_psql('postgres', "SELECT pg_reload_conf()"); + +# Start first resize session (will pause at injection point) +my $session1 = $node->background_psql('postgres'); +$session1->query_until( + qr/starting_resize/, + q( + \echo starting_resize + SELECT pg_resize_shared_buffers(); + ) +); + +# Wait until session actually reaches the injection point +$node->wait_for_event('client backend', 'pg-resize-shared-buffers-flag-set'); + +# Start second resize session (should fail immediately since resize is in progress) +my $result2 = $node->safe_psql('postgres', "SELECT pg_resize_shared_buffers()"); + +# The second call should return false (already in progress) +is($result2, 'f', 'Second concurrent resize call returns false'); + +# Wake up the first session +$node->safe_psql('postgres', + "SELECT injection_points_wakeup('pg-resize-shared-buffers-flag-set')"); + 
+# The pg_resize_shared_buffers() in session1 should now complete successfully +# We can't easily capture the return value from query_until, but we can +# verify the session completes without error and the resize actually happened +$session1->quit; + +# Detach injection point +$node->safe_psql('postgres', + "SELECT injection_points_detach('pg-resize-shared-buffers-flag-set')"); + +done_testing(); \ No newline at end of file diff --git a/src/test/buffermgr/t/004_client_join_buffer_resize.pl b/src/test/buffermgr/t/004_client_join_buffer_resize.pl new file mode 100644 index 000000000000..06f0de6b4091 --- /dev/null +++ b/src/test/buffermgr/t/004_client_join_buffer_resize.pl @@ -0,0 +1,241 @@ +# Copyright (c) 2025-2025, PostgreSQL Global Development Group +# +# Test shared_buffer resizing coordination with client connections joining using injection points + +use strict; +use warnings; +use IPC::Run; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use Time::HiRes qw(sleep); + +# Skip this test if injection points are not supported +if ($ENV{enable_injection_points} ne 'yes') +{ + plan skip_all => 'Injection points not supported by this build'; +} + +# Function to calculate the size of test table required to fill up maximum +# buffer pool when populating it. 
+sub calculate_test_sizes
+{
+	my ($node, $block_size) = ($_[0], $_[-1]);	# node is first, block size is last (tolerates legacy 3-arg callers)
+
+	# Get the maximum buffer pool size from configuration
+	my $max_shared_buffers = $node->safe_psql('postgres', "SHOW max_shared_buffers");
+	my ($max_val, $max_unit) = ($max_shared_buffers =~ /(\d+)(\w+)/);
+	my $max_size_bytes;
+	if (lc($max_unit) eq 'kb') {
+		$max_size_bytes = $max_val * 1024;
+	} elsif (lc($max_unit) eq 'mb') {
+		$max_size_bytes = $max_val * 1024 * 1024;
+	} elsif (lc($max_unit) eq 'gb') {
+		$max_size_bytes = $max_val * 1024 * 1024 * 1024;
+	} else {
+		# Default to kB if unit is not recognized
+		$max_size_bytes = $max_val * 1024;
+	}
+
+	# Fill more pages than minimally required to increase the chances of pages
+	# from the test table filling the buffer cache.
+	# (No scaling of max_size_bytes is needed here; the "+ 10" below is the headroom.)
+	my $pages_needed = int($max_size_bytes / $block_size) + 10; # Add some extra to ensure buffers are filled
+	my $rows_to_insert = $pages_needed * 100; # Assuming roughly 100 rows per page for our table structure
+
+	return ($max_size_bytes, $pages_needed, $rows_to_insert);
+}
+
+# Function to calculate expected buffer count from size string
+sub calculate_buffer_count
+{
+	my ($size_string, $block_size) = @_;
+
+	# Parse size and convert to bytes
+	my ($size_val, $unit) = ($size_string =~ /(\d+)(\w+)/);
+	my $size_bytes;
+	if (lc($unit) eq 'kb') {
+		$size_bytes = $size_val * 1024;
+	} elsif (lc($unit) eq 'mb') {
+		$size_bytes = $size_val * 1024 * 1024;
+	} elsif (lc($unit) eq 'gb') {
+		$size_bytes = $size_val * 1024 * 1024 * 1024;
+	} else {
+		# Default to kB if unit is not recognized
+		$size_bytes = $size_val * 1024;
+	}
+
+	return int($size_bytes / $block_size);
+}
+
+# Initialize cluster with very small buffer sizes for testing
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+
+# Configure for buffer resizing with very small buffer pool sizes for faster tests.
+# TODO: for some reason parallel workers try to load default number of shared_buffers which doesn't work with lower max_shared_buffers. We need to fix that - somewhere it's picking default value of shared buffers. For now disable parallelism +$node->append_conf('postgresql.conf', 'shared_preload_libraries = injection_points'); +$node->append_conf('postgresql.conf', qq{ +max_shared_buffers = 512kB +shared_buffers = 320kB +max_parallel_workers_per_gather = 0 +}); + +$node->start; + +# Enable injection points +$node->safe_psql('postgres', "CREATE EXTENSION injection_points"); + +# Get the block size (this is fixed for the binary) +my $block_size = $node->safe_psql('postgres', "SHOW block_size"); + +# Try to create pg_buffercache extension for buffer analysis +eval { + $node->safe_psql('postgres', "CREATE EXTENSION pg_buffercache"); +}; +if ($@) { + $node->stop; + plan skip_all => 'pg_buffercache extension not available - cannot verify buffer usage'; +} + +# Create a small test table, and fetch its properties for later reference if required. +$node->safe_psql('postgres', qq{ + CREATE TABLE client_test (c1 int, data char(50)); +}); + +my $table_oid = $node->safe_psql('postgres', "SELECT oid FROM pg_class WHERE relname = 'client_test'"); +my $table_relfilenode = $node->safe_psql('postgres', "SELECT relfilenode FROM pg_class WHERE relname = 'client_test'"); +note("Test table client_test: OID = $table_oid, relfilenode = $table_relfilenode"); +my ($max_size_bytes, $pages_needed, $rows_to_insert) = calculate_test_sizes($node, $block_size); + +# Create dedicated sessions for injection point handling and test queries, +# so that we don't create new backends for test operations after starting +# resize operation. Only one backend, which tests new backend synchronization +# with resizing operation, should start after resizing has commenced. 
+my $injection_session = $node->background_psql('postgres');
+my $query_session = $node->background_psql('postgres');
+my $resize_session = $node->background_psql('postgres');
+
+# Function to run a single injection point test
+sub run_injection_point_test
+{
+	my ($test_name, $injection_point, $target_size, $operation_type) = @_;
+
+	note("Test with $test_name ($operation_type)");
+
+	# Calculate test parameters before starting resize (they depend only on
+	# max_shared_buffers and the block size, not on the target size)
+	my ($max_size_bytes, $pages_needed, $rows_to_insert) = calculate_test_sizes($node, $block_size);
+
+	# Update buffer pool size and wait for it to reflect pending state
+	$resize_session->query_safe("ALTER SYSTEM SET shared_buffers = '$target_size'");
+	$resize_session->query_safe("SELECT pg_reload_conf()");
+	my $pending_size_str = "pending: $target_size";
+	$resize_session->poll_query_until("SELECT substring(current_setting('shared_buffers'), '$pending_size_str')", $pending_size_str);
+
+	# Set up injection point in injection session
+	$injection_session->query_safe("SELECT injection_points_attach('$injection_point', 'wait')");
+
+	# Trigger resize
+	$resize_session->query_until(
+		qr/starting_resize/,
+		q(
+		\echo starting_resize
+		SELECT pg_resize_shared_buffers();
+		)
+	);
+
+	# Wait until resize actually reaches the injection point using the query session
+	$query_session->wait_for_event('client backend', $injection_point);
+
+	# Start a client while resize is paused
+	my $client = $node->background_psql('postgres');
+	note("Background client backend PID: " . $client->query_safe("SELECT pg_backend_pid()"));
+
+	# Wake up the injection point from injection session
+	$injection_session->query_safe("SELECT injection_points_wakeup('$injection_point')");
+
+	# Test buffer functionality immediately after waking up injection point
+	# Insert data to test buffer pool functionality during/after resize
+	$client->query_safe("INSERT INTO client_test SELECT i, 'test_data_' || i FROM generate_series(1, $rows_to_insert) i");
+	# Verify the data was inserted correctly and can be read back
+	is($client->query_safe("SELECT COUNT(*) FROM client_test"), $rows_to_insert, "inserted $rows_to_insert during $test_name ($operation_type) successful");
+
+	# Verify table size is reasonable (should be substantial for testing)
+	ok($query_session->query_safe("SELECT pg_total_relation_size('client_test')") >= $max_size_bytes, "table size is large enough to overflow buffer pool in test $test_name ($operation_type)");
+
+	# Wait for the resize operation to complete. There is no direct way to do so
+	# in background_psql. Hence fire a psql command and wait for it to finish
+	$resize_session->query(q(\echo 'done'));
+
+	# Detach injection point from injection session
+	$injection_session->query_safe("SELECT injection_points_detach('$injection_point')");
+
+	# Verify resize completed successfully
+	is($query_session->query_safe("SELECT current_setting('shared_buffers')"), $target_size,
+		"resize completed successfully to $target_size");
+
+	# Check buffer pool size using pg_buffercache after resize completion
+	is($query_session->query_safe("SELECT COUNT(*) FROM pg_buffercache"), calculate_buffer_count($target_size, $block_size), "all buffers in the buffer pool used in $test_name ($operation_type)");
+
+	# Wait for client to complete
+	ok($client->quit, "client succeeded during $test_name ($operation_type)");
+
+	# Clean up for next test
+	$query_session->query_safe("DELETE FROM client_test");
+}
+
+# Test injection points during buffer resize with client connections
+my @common_injection_tests = (
+	{
+		name => 'flag setting phase',
+		injection_point => 'pg-resize-shared-buffers-flag-set',
+	},
+	{
+		name => 'memory remap phase',
+		injection_point => 'pgrsb-after-shmem-resize',
+	},
+	{
+		name => 'resize map barrier complete',
+		injection_point => 'pgrsb-resize-barrier-sent',
+	},
+);
+
+# Test common injection points for both shrinking and expanding
+foreach my $test (@common_injection_tests)
+{
+	# Test shrinking scenario
+	run_injection_point_test($test->{name}, $test->{injection_point}, '272kB', 'shrinking');
+
+	# Test expanding scenario
+	run_injection_point_test($test->{name}, $test->{injection_point}, '400kB', 'expanding');
+}
+
+my @shrink_only_tests = (
+	{
+		name => 'shrink barrier complete',
+		injection_point => 'pgrsb-shrink-barrier-sent',
+		size => '200kB',
+	}
+);
+foreach my $test (@shrink_only_tests)
+{
+	run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'shrinking only');
+}
+
+my @expand_only_tests = (
+	{
+		name => 'expand barrier complete',
+		injection_point => 'pgrsb-expand-barrier-sent',
+		size => '416kB',
+	}
+);
+foreach my $test (@expand_only_tests)
+{
+	run_injection_point_test($test->{name}, $test->{injection_point}, $test->{size}, 'expanding only');
+}
+
+$injection_session->quit;
+$query_session->quit;
+$resize_session->quit;
+
+done_testing();
\ No newline at end of file
diff --git a/src/test/meson.build b/src/test/meson.build
index ccc31d6a86a1..2a5ba1dec398 100644
--- a/src/test/meson.build
+++ b/src/test/meson.build
@@ -4,6 +4,7 @@
 subdir('regress')
 subdir('isolation')
 subdir('authentication')
+subdir('buffermgr')
 subdir('postmaster')
 subdir('recovery')
 subdir('subscription')
diff --git a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
index 60bbd5dd445b..16625e94d92e 100644
--- a/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
+++ b/src/test/perl/PostgreSQL/Test/BackgroundPsql.pm
@@ -61,6 +61,7 @@
 use Config;
 use IPC::Run;
 use PostgreSQL::Test::Utils qw(pump_until);
 use Test::More;
+use Time::HiRes qw(usleep);
 
 =pod
@@ -371,4 +372,79 @@ sub set_query_timer_restart
 	return $self->{query_timer_restart};
 }
+=pod
+
+=item $session->poll_query_until($query [, $expected ])
+
+Run B<$query> repeatedly in this background session, until it returns the
+B<$expected> result ('t', or SQL boolean true, by default).
+Continues polling if the query returns an error result.
+Times out after a reasonable number of attempts.
+Returns 1 if successful, 0 if timed out.
+ +=cut + +sub poll_query_until +{ + my ($self, $query, $expected) = @_; + + $expected = 't' unless defined($expected); # default value + + my $max_attempts = 10 * $PostgreSQL::Test::Utils::timeout_default; + my $attempts = 0; + my ($stdout, $stderr_flag); + + while ($attempts < $max_attempts) + { + ($stdout, $stderr_flag) = $self->query($query); + + chomp($stdout); + + # If query succeeded and returned expected result + if (!$stderr_flag && $stdout eq $expected) + { + return 1; + } + + # Wait 0.1 second before retrying. + usleep(100_000); + + $attempts++; + } + + # Give up. Print the output from the last attempt, hopefully that's useful + # for debugging. + my $stderr_output = $stderr_flag ? $self->{stderr} : ''; + diag qq(poll_query_until timed out executing this query: +$query +expecting this output: +$expected +last actual query output: +$stdout +with stderr: +$stderr_output); + return 0; +} + +=item $session->wait_for_event(backend_type, wait_event_name) + +Poll pg_stat_activity until backend_type reaches wait_event_name using this +background session. 
+ +=cut + +sub wait_for_event +{ + my ($self, $backend_type, $wait_event_name) = @_; + + $self->poll_query_until(qq[ + SELECT count(*) > 0 FROM pg_stat_activity + WHERE backend_type = '$backend_type' AND wait_event = '$wait_event_name' + ]) + or die + qq(timed out when waiting for $backend_type to reach wait event '$wait_event_name'); + + return; +} + 1; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 372a2188c22a..f02e82de520d 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1765,14 +1765,22 @@ pg_shadow| SELECT pg_authid.rolname AS usename, LEFT JOIN pg_db_role_setting s ON (((pg_authid.oid = s.setrole) AND (s.setdatabase = (0)::oid)))) WHERE pg_authid.rolcanlogin; pg_shmem_allocations| SELECT name, + segment, off, size, allocated_size - FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, off, size, allocated_size); + FROM pg_get_shmem_allocations() pg_get_shmem_allocations(name, segment, off, size, allocated_size); pg_shmem_allocations_numa| SELECT name, numa_node, size FROM pg_get_shmem_allocations_numa() pg_get_shmem_allocations_numa(name, numa_node, size); +pg_shmem_segments| SELECT id, + name, + size, + freeoffset, + mapping_size, + mapping_reserved_size + FROM pg_get_shmem_segments() pg_get_shmem_segments(id, name, size, freeoffset, mapping_size, mapping_reserved_size); pg_stat_activity| SELECT s.datid, d.datname, s.pid, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 27a4d1318978..4d9879ac60d7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2778,6 +2778,7 @@ ShellTypeInfo ShippableCacheEntry ShippableCacheKey ShmemIndexEnt +ShmemControl ShutdownForeignScan_function ShutdownInformation ShutdownMode