diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 59a0874528a3..ab20630481dd 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2329,6 +2329,32 @@ include_dir 'conf.d' + + numa (string) + + numa configuration parameter + + + + + Specifies whether to use the NUMA interleaving policy for the shared memory + segment. Possible values are off, + all (interleaves shared memory across all available NUMA nodes), + auto (same as all, but only if the number of available NUMA nodes is 2 or higher), + or [=@]comma-separated list of node numbers or node ranges. + + If the comma-separated list of NUMA nodes is prefixed with =, the memory allocations + are made strict to avoid spilling to other NUMA nodes. + If the comma-separated list of NUMA nodes is prefixed with @, the memory allocations + are made strict and the available CPUs are also limited to those of the listed NUMA nodes. + + This parameter is only effective on Linux. Parallel Query interleaving is + only supported with dynamic_shared_memory_type=posix. + The default value is off. This parameter can only be + set at server start. + + + diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 197926d44f6b..77af7c56ecd3 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -29,6 +29,7 @@ #include "miscadmin.h" #include "port/pg_bitutils.h" +#include "port/pg_numa.h" #include "portability/mem.h" #include "storage/dsm.h" #include "storage/fd.h" @@ -663,6 +664,27 @@ CreateAnonymousSegment(Size *size) allocsize) : 0)); } + if (numa->setting > NUMA_OFF) + { + /* In strict modes we want to make sure memory does not spill to other NUMA nodes */ + int mem_bind_policy = numa->setting >= NUMA_STRICT_ONLY ? 
1 : 0; + + /* In auto mode do nothing on a single-node system: pg_numa_get_max_node() is the highest node number, i.e. 0 there */ + if(numa->setting == NUMA_AUTO && pg_numa_get_max_node() < 1) { + elog(DEBUG1, "no NUMA nodes found"); + } else { + elog(LOG, "enabling NUMA shm interleaving"); + pg_numa_interleave_memptr(ptr, allocsize, numa->nodes); + + /* In NUMA_PREFERRED we can spill memory to other nodes, but not in STRICT modes */ + pg_numa_set_bind_policy(mem_bind_policy); + + /* We can also isolate CPUs to just isolated NUMA nodes */ + if(numa->setting >= NUMA_STRICT_ONLY_AND_CPU_TOO) + pg_numa_bind(numa->nodes); + } + } + *size = allocsize; return ptr; } diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 490f7ce36645..bc9e3da8fa7c 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -100,6 +100,7 @@ #include "pg_getopt.h" #include "pgstat.h" #include "port/pg_bswap.h" +#include "port/pg_numa.h" #include "postmaster/autovacuum.h" #include "postmaster/bgworker_internals.h" #include "postmaster/pgarch.h" @@ -113,6 +114,7 @@ #include "storage/fd.h" #include "storage/io_worker.h" #include "storage/ipc.h" +#include "storage/pg_shmem.h" #include "storage/pmsignal.h" #include "storage/proc.h" #include "tcop/backend_startup.h" @@ -453,6 +455,7 @@ static void StartSysLogger(void); static void StartAutovacuumWorker(void); static bool StartBackgroundWorker(RegisteredBgWorker *rw); static void InitPostmasterDeathWatchHandle(void); +static void InitNuma(void); #ifdef WIN32 #define WNOHANG 0 /* ignored, so any integer value will do */ @@ -993,6 +996,9 @@ PostmasterMain(int argc, char *argv[]) ExitPostmaster(0); } + /* Initialize libnuma if necessary */ + InitNuma(); + /* * Set up shared memory and semaphores. 
* @@ -4616,3 +4622,17 @@ InitPostmasterDeathWatchHandle(void) GetLastError()))); #endif /* WIN32 */ }
+
+/*
+ * InitNuma -- initialize libnuma when the "numa" setting requests NUMA support.
+ *
+ * Does nothing while numa->setting is NUMA_OFF.  Otherwise pg_numa_init() is
+ * called and a return value of -1 is reported as a FATAL error.
+ */
+static void
+InitNuma(void)
+{
+	if (numa->setting > NUMA_OFF && pg_numa_init() == -1)
+		ereport(FATAL,
+				(errmsg("libnuma initialization failed or NUMA is not supported on this platform")));
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c index 6bf8ab5bb5b5..46dcef48394b 100644 --- a/src/backend/storage/ipc/dsm_impl.c +++ b/src/backend/storage/ipc/dsm_impl.c @@ -64,8 +64,10 @@ #include "pgstat.h" #include "portability/mem.h" #include "postmaster/postmaster.h" +#include "port/pg_numa.h" #include "storage/dsm_impl.h" #include "storage/fd.h" +#include "storage/pg_shmem.h" #include "utils/guc.h" #include "utils/memutils.h" @@ -334,6 +336,13 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, } *mapped_address = address; *mapped_size = request_size; + + /* We interleave memory only at creation time. */ + if (op == DSM_OP_CREATE && numa->setting > NUMA_OFF) { + elog(DEBUG1, "interleaving shm mem @ %p size=%zu", *mapped_address, *mapped_size); + pg_numa_interleave_memptr(*mapped_address, *mapped_size, numa->nodes); + } + close(fd); ReleaseExternalFD(); @@ -588,6 +597,8 @@ dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, *mapped_address = address; *mapped_size = request_size; + /* As dynamic_shared_memory_type=sysv is a bit legacy, we do not perform NUMA interleaving here */ + return true; } #endif @@ -937,6 +948,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, *mapped_address = address; *mapped_size = request_size; + /* As dynamic_shared_memory_type=mmap is a bit legacy, we do not perform NUMA interleaving here */ + if (CloseTransientFile(fd) != 0) { ereport(elevel, diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index ca3656fc76f4..4ab299a8e34b 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -74,6 +74,10 @@ #include 
"storage/shmem.h" #include "storage/spin.h" #include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/guc_hooks.h" +#include +#include static void *ShmemAllocRaw(Size size, Size *allocated_size); @@ -763,3 +767,95 @@ pg_numa_available(PG_FUNCTION_ARGS) { PG_RETURN_BOOL(pg_numa_init() != -1); }
+
+/*
+ * check_numa: GUC check hook for the "numa" parameter.
+ *
+ * Parses *newval into a NumaConfigData that is handed to assign_numa()
+ * through *extra.  Returns false if the value is rejected or if the
+ * NumaConfigData cannot be allocated.
+ */
+bool
+check_numa(char **newval, void **extra, GucSource source)
+{
+	NumaConfigData *n;
+	char	   *rawstring = *newval;
+
+	/* guc_malloc() at LOG level reports out-of-memory and returns NULL */
+	n = (NumaConfigData *) guc_malloc(LOG, sizeof(NumaConfigData));
+	if (n == NULL)
+		return false;
+
+	/* Never leave the node mask uninitialized, whichever branch is taken */
+	n->setting = NUMA_OFF;
+	n->nodes = NULL;
+
+#ifndef USE_LIBNUMA
+	if (!(strcmp(rawstring, "") == 0 || strcmp(rawstring, "off") == 0))
+	{
+		GUC_check_errdetail("\"%s\" is not supported on this platform.",
+							"numa");
+		guc_free(n);
+		return false;
+	}
+#else
+	if (strcmp(rawstring, "") == 0)
+		n->setting = DEFAULT_NUMA;
+	else if (pg_strcasecmp(rawstring, "off") == 0)
+		n->setting = NUMA_OFF;
+	else if (pg_strcasecmp(rawstring, "all") == 0)
+	{
+		n->setting = NUMA_ALL;
+		n->nodes = numa_all_nodes_ptr;
+	}
+	else if (pg_strcasecmp(rawstring, "auto") == 0)
+	{
+		n->setting = NUMA_AUTO;
+		n->nodes = numa_all_nodes_ptr;
+	}
+	else if (isdigit((unsigned char) rawstring[0]))
+		n->setting = NUMA_PREFERRED;	/* bare list: preferred nodes */
+	else if (rawstring[0] == '=')
+		n->setting = NUMA_STRICT_ONLY;
+	else if (rawstring[0] == '@')
+		n->setting = NUMA_STRICT_ONLY_AND_CPU_TOO;
+	else
+	{
+		GUC_check_errdetail("Invalid option \"%s\".", rawstring);
+		guc_free(n);
+		return false;
+	}
+
+	if (n->setting >= NUMA_PREFERRED)
+	{
+		/* the strict modes carry a one-character prefix before the list */
+		const char *s = rawstring;
+
+		if (n->setting >= NUMA_STRICT_ONLY)
+			s++;
+
+		n->nodes = pg_numa_parse_nodestring(s);
+		if (n->nodes == NULL)
+		{
+			GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
+								"numa");
+			guc_free(n);
+			return false;
+		}
+	}
+#endif
+
+	*extra = n;
+	return true;
+}
+
+/*
+ * assign_numa: GUC assign hook for the "numa" parameter.
+ *
+ * Installs the NumaConfigData built by check_numa() as the active setting.
+ */
+void
+assign_numa(const char *newval, void *extra)
+{
+	numa = (NumaConfigData *) extra;
+}
diff --git 
a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 511dc32d5192..7e4a6618210d 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -491,6 +491,7 @@ static const struct config_enum_entry file_copy_method_options[] = { {NULL, 0, false} }; + /* * Options for enum values stored in other modules */ @@ -580,6 +581,8 @@ int huge_pages = HUGE_PAGES_TRY; int huge_page_size; int huge_pages_status = HUGE_PAGES_UNKNOWN; +NumaConfigData *numa; + /* * These variables are all dummies that don't do anything, except in some * cases provide the value for SHOW to display. The real state is elsewhere @@ -594,6 +597,7 @@ static char *server_version_string; static int server_version_num; static char *debug_io_direct_string; static char *restrict_nonsystem_relation_kind_string; +static char *numa_string; #ifdef HAVE_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0 @@ -4984,6 +4988,16 @@ struct config_string ConfigureNamesString[] = check_log_connections, assign_log_connections, NULL }, + { + {"numa", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("Whether to enable NUMA optimizations."), + NULL + }, + &numa_string, + "", + check_numa, assign_numa, NULL + }, + /* End-of-list marker */ { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 341f88adc87b..d9e0c165a949 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -135,6 +135,8 @@ # (change requires restart) #huge_page_size = 0 # zero for system default # (change requires restart) +#numa = off # off, all, auto, or [=@]comma-separated list of NUMA nodes + # (change requires restart) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h index 9d1ea6d0db89..2c91535b5584 100644 --- a/src/include/port/pg_numa.h +++ 
b/src/include/port/pg_numa.h @@ -14,9 +14,19 @@ #ifndef PG_NUMA_H #define PG_NUMA_H +// JW: is this legal to be included here? +#include +#include + +typedef struct bitmask pg_numa_bitmask_t; /* node mask type used by the pg_numa_* wrappers */ + extern PGDLLIMPORT int pg_numa_init(void); extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status); extern PGDLLIMPORT int pg_numa_get_max_node(void); +extern PGDLLIMPORT int pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask); +extern PGDLLIMPORT pg_numa_bitmask_t *pg_numa_parse_nodestring(const char *string); +extern PGDLLIMPORT void pg_numa_set_bind_policy(int strict); +extern PGDLLIMPORT void pg_numa_bind(pg_numa_bitmask_t *nodemask); #ifdef USE_LIBNUMA @@ -32,6 +42,9 @@ pg_numa_touch_mem_if_required(void *ptr) touch = *(volatile uint64 *) ptr; } +extern void numa_warn(int num, char *fmt,...) pg_attribute_printf(2, 3); +extern void numa_error(char *where); + #else #define pg_numa_touch_mem_if_required(ptr) \ diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 5f7d4b83a60e..0c95fc4cdd08 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -25,6 +25,7 @@ #define PG_SHMEM_H #include "storage/dsm_impl.h" +#include "port/pg_numa.h" typedef struct PGShmemHeader /* standard header for all Postgres shmem */ { @@ -41,11 +42,17 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ #endif } PGShmemHeader; +typedef struct NumaConfigData { /* parsed form of the "numa" GUC, built by check_numa() */ + int setting; /* one of the NumaType values */ + pg_numa_bitmask_t *nodes; /* node mask passed to pg_numa_interleave_memptr() and pg_numa_bind() */ +} NumaConfigData; + /* GUC variables */ extern PGDLLIMPORT int shared_memory_type; extern PGDLLIMPORT int huge_pages; extern PGDLLIMPORT int huge_page_size; extern PGDLLIMPORT int huge_pages_status; +extern PGDLLIMPORT NumaConfigData *numa; /* Possible values for huge_pages and huge_pages_status */ typedef enum @@ -64,6 +71,18 @@ typedef enum SHMEM_TYPE_MMAP, } PGShmemType; +typedef enum +{ + NUMA_OFF, /* "off" or empty setting */ + NUMA_ALL, /* "all" */ + NUMA_AUTO, /* "auto": as "all", subject to the node-count check in CreateAnonymousSegment() */ + NUMA_PREFERRED, /* bare node list */ 
NUMA_STRICT_ONLY, /* node list prefixed with "=" */ + NUMA_STRICT_ONLY_AND_CPU_TOO, /* node list prefixed with "@" */ +} NumaType; + +#define DEFAULT_NUMA NUMA_OFF /* what check_numa() uses for an empty setting */ + #ifndef WIN32 extern PGDLLIMPORT unsigned long UsedShmemSegID; #else diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 799fa7ace684..854a7dd02b4c 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -94,6 +94,8 @@ extern bool check_multixact_member_buffers(int *newval, void **extra, extern bool check_multixact_offset_buffers(int *newval, void **extra, GucSource source); extern bool check_notify_buffers(int *newval, void **extra, GucSource source); +extern bool check_numa(char **newval, void **extra, GucSource source); +extern void assign_numa(const char *newval, void *extra); extern bool check_primary_slot_name(char **newval, void **extra, GucSource source); extern bool check_random_seed(double *newval, void **extra, GucSource source); diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c index 3368a43a3382..d461c15365fa 100644 --- a/src/port/pg_numa.c +++ b/src/port/pg_numa.c @@ -13,11 +13,18 @@ *------------------------------------------------------------------------- */ -#include "c.h" +//JW:is this legal to replace "c.h" with below: +#ifndef FRONTEND +#include "postgres.h" +#else +#include "postgres_fe.h" +#endif + #include #include "miscadmin.h" #include "port/pg_numa.h" +#include "common/string.h" /* * At this point we provide support only for Linux thanks to libnuma, but in @@ -106,6 +113,87 @@ pg_numa_get_max_node(void) return numa_max_node(); } +int +pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask) +{ + numa_interleave_memory(ptr, sz, mask); /* no result is checked here */ + return 0; /* always 0 */ +} + +pg_numa_bitmask_t * +pg_numa_parse_nodestring(const char *string) +{ + return numa_parse_nodestring(string); /* check_numa() treats a NULL result as a syntax error */ +} + +void +pg_numa_set_bind_policy(int strict) +{ + numa_set_bind_policy(strict); +} + +void +pg_numa_bind(pg_numa_bitmask_t *nodemask) +{ + numa_bind(nodemask); +} + +#ifndef FRONTEND +/* + * The standard libnuma 
built-in code can be seen here:
+ * https://github.com/numactl/numactl/blob/master/libnuma.c
+ * numa(3) documents numa_warn() and numa_error() as overridable by the application; these versions report through ereport/elog.
+ */
+void
+numa_warn(int num, char *fmt,...)
+{
+	va_list		ap;
+	int			olde = errno;
+	int			needed;
+	StringInfoData msg;
+
+	initStringInfo(&msg);
+
+	/* appendStringInfoVA() returns 0 once the text fits; otherwise enlarge and retry */
+	for (;;)
+	{
+		va_start(ap, fmt);
+		needed = appendStringInfoVA(&msg, fmt, ap);
+		va_end(ap);
+		if (needed == 0)
+			break;
+		enlargeStringInfo(&msg, needed);
+	}
+
+	/* chomp last newline character */
+	pg_strip_crlf(msg.data);
+
+	ereport(WARNING,
+			(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+			 errmsg_internal("libnuma: %s", msg.data)));
+
+	pfree(msg.data);
+
+	errno = olde;
+}
+
+void
+numa_error(char *where)
+{
+	int			olde = errno;
+
+	/* chomp last newline character (done in place on the caller's string) */
+	pg_strip_crlf(where);
+
+	/*
+	 * XXX: for now we issue just WARNING, but long-term that might depend on
+	 * numa_set_strict() here.
+	 */
+	elog(WARNING, "libnuma: %s", where);
+	errno = olde;
+}
+#endif							/* FRONTEND */
+
 #else /* Empty wrappers */ @@ -128,4 +216,28 @@ pg_numa_get_max_node(void) return 0; } +int +pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask) +{ + return 0; +} + +pg_numa_bitmask_t * +pg_numa_parse_nodestring(const char *string) +{ + return NULL; +} + +void +pg_numa_set_bind_policy(int strict) +{ + return; +} + +void +pg_numa_bind(pg_numa_bitmask_t *nodemask) +{ + return; +} + #endif