diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 59a0874528a3..ab20630481dd 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2329,6 +2329,32 @@ include_dir 'conf.d'
+
+ numa (string)
+
+ numa configuration parameter
+
+
+
+
+ Specifies whether to use the NUMA interleaving policy for the shared memory
+ segment. Possible values are off,
+ all (interleaves shared memory across all available NUMA nodes),
+ auto (like all, but only if two or more NUMA nodes are available),
+ or a comma-separated list of node numbers or node ranges, optionally prefixed with = or @.
+
+ If the comma-separated list of NUMA nodes is prefixed with =, memory allocations
+ are made strict to avoid spilling over to other NUMA nodes.
+ If the comma-separated list of NUMA nodes is prefixed with @, memory allocations
+ are made strict and the available CPUs are also limited to those of the listed NUMA nodes.
+
+ This parameter is only effective on Linux. Interleaving of the dynamic shared memory
+ used by parallel query is only supported with dynamic_shared_memory_type=posix.
+ The default value is off. This parameter can only be
+ set at server start.
+
+
+
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 197926d44f6b..77af7c56ecd3 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -29,6 +29,7 @@
#include "miscadmin.h"
#include "port/pg_bitutils.h"
+#include "port/pg_numa.h"
#include "portability/mem.h"
#include "storage/dsm.h"
#include "storage/fd.h"
@@ -663,6 +664,27 @@ CreateAnonymousSegment(Size *size)
allocsize) : 0));
}
+	if (numa->setting > NUMA_OFF)
+	{
+		/* Strict modes must not let memory spill over to other NUMA nodes. */
+		int			mem_bind_policy = (numa->setting >= NUMA_STRICT_ONLY) ? 1 : 0;
+
+		/*
+		 * Highest node number 0 means a single node: nothing to do in auto mode.
+		 */
+		if (numa->setting == NUMA_AUTO && pg_numa_get_max_node() < 1)
+			elog(DEBUG1, "no NUMA nodes found");
+		else
+		{
+			elog(LOG, "enabling NUMA shm interleaving");
+			pg_numa_interleave_memptr(ptr, allocsize, numa->nodes);
+			/* Preferred (0) may spill to other nodes, strict (1) may not. */
+			pg_numa_set_bind_policy(mem_bind_policy);
+			/* The '@' mode additionally restricts CPUs to the listed nodes. */
+			if (numa->setting >= NUMA_STRICT_ONLY_AND_CPU_TOO)
+				pg_numa_bind(numa->nodes);
+		}
+	}
+
*size = allocsize;
return ptr;
}
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 490f7ce36645..bc9e3da8fa7c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -100,6 +100,7 @@
#include "pg_getopt.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
+#include "port/pg_numa.h"
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/pgarch.h"
@@ -113,6 +114,7 @@
#include "storage/fd.h"
#include "storage/io_worker.h"
#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "tcop/backend_startup.h"
@@ -453,6 +455,7 @@ static void StartSysLogger(void);
static void StartAutovacuumWorker(void);
static bool StartBackgroundWorker(RegisteredBgWorker *rw);
static void InitPostmasterDeathWatchHandle(void);
+static void InitNuma(void);
#ifdef WIN32
#define WNOHANG 0 /* ignored, so any integer value will do */
@@ -993,6 +996,9 @@ PostmasterMain(int argc, char *argv[])
ExitPostmaster(0);
}
+ /* Initialize libnuma if necessary */
+ InitNuma();
+
/*
* Set up shared memory and semaphores.
*
@@ -4616,3 +4622,14 @@ InitPostmasterDeathWatchHandle(void)
GetLastError())));
#endif /* WIN32 */
}
+
+
+/*
+ * Initialize libnuma when the "numa" setting is anything but off; raise an ERROR if that fails.
+ */
+static void
+InitNuma(void)
+{
+	if (numa->setting > NUMA_OFF && pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 6bf8ab5bb5b5..46dcef48394b 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -64,8 +64,10 @@
#include "pgstat.h"
#include "portability/mem.h"
#include "postmaster/postmaster.h"
+#include "port/pg_numa.h"
#include "storage/dsm_impl.h"
#include "storage/fd.h"
+#include "storage/pg_shmem.h"
#include "utils/guc.h"
#include "utils/memutils.h"
@@ -334,6 +336,13 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
}
*mapped_address = address;
*mapped_size = request_size;
+
+ /* We interleave memory only at creation time. */
+ if (op == DSM_OP_CREATE && numa->setting > NUMA_OFF) {
+ elog(DEBUG1, "interleaving shm mem @ %p size=%zu", *mapped_address, *mapped_size);
+ pg_numa_interleave_memptr(*mapped_address, *mapped_size, numa->nodes);
+ }
+
close(fd);
ReleaseExternalFD();
@@ -588,6 +597,8 @@ dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
*mapped_address = address;
*mapped_size = request_size;
+ /* dynamic_shared_memory_type=sysv is a legacy option, so we do not perform NUMA interleaving here */
+
return true;
}
#endif
@@ -937,6 +948,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
*mapped_address = address;
*mapped_size = request_size;
+ /* dynamic_shared_memory_type=mmap is a legacy option, so we do not perform NUMA interleaving here */
+
if (CloseTransientFile(fd) != 0)
{
ereport(elevel,
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index ca3656fc76f4..4ab299a8e34b 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -74,6 +74,10 @@
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include <numa.h>
+#include <numaif.h>
static void *ShmemAllocRaw(Size size, Size *allocated_size);
@@ -763,3 +767,75 @@ pg_numa_available(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(pg_numa_init() != -1);
}
+
+/*
+ * GUC check hook for "numa": parse the value into a guc_malloc'd NumaConfigData for *extra.
+ */
+bool
+check_numa(char **newval, void **extra, GucSource source)
+{
+	NumaConfigData *n;
+	char	   *rawstring = *newval;
+
+	n = (NumaConfigData *) guc_malloc(LOG, sizeof(NumaConfigData));
+	if (n == NULL)
+		return false;			/* allocation failed; nothing to hand over */
+	n->setting = NUMA_OFF;
+	n->nodes = NULL;
+
+#ifndef USE_LIBNUMA
+	/* Without libnuma only the empty string and "off" are accepted. */
+	if (!(strcmp(rawstring, "") == 0 || strcmp(rawstring, "off") == 0))
+	{
+		GUC_check_errdetail("\"%s\" is not supported on this platform.", "numa");
+		guc_free(n);
+		return false;
+	}
+#else
+	if (strcmp(rawstring, "") == 0)
+		n->setting = DEFAULT_NUMA;
+	else if (pg_strcasecmp(rawstring, "off") == 0)
+		n->setting = NUMA_OFF;
+	else if (pg_strcasecmp(rawstring, "all") == 0)
+		n->setting = NUMA_ALL;
+	else if (pg_strcasecmp(rawstring, "auto") == 0)
+		n->setting = NUMA_AUTO;
+	else if (isdigit((unsigned char) rawstring[0]))
+		n->setting = NUMA_PREFERRED;	/* bare list: preferred nodes */
+	else if (rawstring[0] == '=')
+		n->setting = NUMA_STRICT_ONLY;
+	else if (rawstring[0] == '@')
+		n->setting = NUMA_STRICT_ONLY_AND_CPU_TOO;
+	else
+	{
+		GUC_check_errdetail("Invalid option \"%s\".", rawstring);
+		guc_free(n);
+		return false;
+	}
+
+	if (n->setting == NUMA_ALL || n->setting == NUMA_AUTO)
+		n->nodes = numa_all_nodes_ptr;
+	else if (n->setting >= NUMA_PREFERRED)
+	{
+		/* Skip the '=' or '@' prefix before parsing the node list. */
+		const char *s = (n->setting >= NUMA_STRICT_ONLY) ? rawstring + 1 : rawstring;
+
+		n->nodes = pg_numa_parse_nodestring(s);
+		if (n->nodes == NULL)
+		{
+			GUC_check_errdetail("Invalid list syntax in parameter \"%s\".", "numa");
+			guc_free(n);
+			return false;
+		}
+	}
+#endif
+
+	*extra = n;
+	return true;
+}
+
+void
+assign_numa(const char *newval, void *extra)
+{
+ numa = (NumaConfigData *) extra; /* GUC assign hook: publish the struct check_numa stored in *extra */
+}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 511dc32d5192..7e4a6618210d 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -491,6 +491,7 @@ static const struct config_enum_entry file_copy_method_options[] = {
{NULL, 0, false}
};
+
/*
* Options for enum values stored in other modules
*/
@@ -580,6 +581,8 @@ int huge_pages = HUGE_PAGES_TRY;
int huge_page_size;
int huge_pages_status = HUGE_PAGES_UNKNOWN;
+NumaConfigData *numa;
+
/*
* These variables are all dummies that don't do anything, except in some
* cases provide the value for SHOW to display. The real state is elsewhere
@@ -594,6 +597,7 @@ static char *server_version_string;
static int server_version_num;
static char *debug_io_direct_string;
static char *restrict_nonsystem_relation_kind_string;
+static char *numa_string;
#ifdef HAVE_SYSLOG
#define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -4984,6 +4988,16 @@ struct config_string ConfigureNamesString[] =
check_log_connections, assign_log_connections, NULL
},
+ {
+ {"numa", PGC_POSTMASTER, RESOURCES_MEM,
+ gettext_noop("Whether to enable NUMA optimizations."),
+ NULL
+ },
+ &numa_string,
+ "",
+ check_numa, assign_numa, NULL
+ },
+
/* End-of-list marker */
{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 341f88adc87b..d9e0c165a949 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -135,6 +135,8 @@
# (change requires restart)
#huge_page_size = 0 # zero for system default
# (change requires restart)
+#numa = off # off, all, auto, or comma-separated list of NUMA nodes
+ # (change requires restart)
#temp_buffers = 8MB # min 800kB
#max_prepared_transactions = 0 # zero disables the feature
# (change requires restart)
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 9d1ea6d0db89..2c91535b5584 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -14,9 +14,19 @@
#ifndef PG_NUMA_H
#define PG_NUMA_H
+// JW: is this legal to be included here?
+#include <numa.h>
+#include <numaif.h>
+
+typedef struct bitmask pg_numa_bitmask_t;
+
extern PGDLLIMPORT int pg_numa_init(void);
extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status);
extern PGDLLIMPORT int pg_numa_get_max_node(void);
+extern PGDLLIMPORT int pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask);
+extern PGDLLIMPORT pg_numa_bitmask_t *pg_numa_parse_nodestring(const char *string);
+extern PGDLLIMPORT void pg_numa_set_bind_policy(int strict);
+extern PGDLLIMPORT void pg_numa_bind(pg_numa_bitmask_t *nodemask);
#ifdef USE_LIBNUMA
@@ -32,6 +42,9 @@ pg_numa_touch_mem_if_required(void *ptr)
touch = *(volatile uint64 *) ptr;
}
+extern void numa_warn(int num, char *fmt,...) pg_attribute_printf(2, 3);
+extern void numa_error(char *where);
+
#else
#define pg_numa_touch_mem_if_required(ptr) \
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 5f7d4b83a60e..0c95fc4cdd08 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -25,6 +25,7 @@
#define PG_SHMEM_H
#include "storage/dsm_impl.h"
+#include "port/pg_numa.h"
typedef struct PGShmemHeader /* standard header for all Postgres shmem */
{
@@ -41,11 +42,17 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */
#endif
} PGShmemHeader;
+typedef struct NumaConfigData {
+ int setting; /* parsed "numa" GUC mode; holds one of the NumaType values */
+ pg_numa_bitmask_t *nodes; /* node mask for interleaving/binding; only meaningful when setting is not NUMA_OFF */
+} NumaConfigData;
+
/* GUC variables */
extern PGDLLIMPORT int shared_memory_type;
extern PGDLLIMPORT int huge_pages;
extern PGDLLIMPORT int huge_page_size;
extern PGDLLIMPORT int huge_pages_status;
+extern PGDLLIMPORT NumaConfigData *numa;
/* Possible values for huge_pages and huge_pages_status */
typedef enum
@@ -64,6 +71,18 @@ typedef enum
SHMEM_TYPE_MMAP,
} PGShmemType;
+typedef enum
+{
+ NUMA_OFF, /* no NUMA policy; callers test "setting > NUMA_OFF", so this must stay first */
+ NUMA_ALL, /* "all": use all nodes (numa_all_nodes_ptr) */
+ NUMA_AUTO, /* "auto": like NUMA_ALL, but main shared memory is left alone when too few nodes exist */
+ NUMA_PREFERRED, /* bare node list; this and later values carry a parsed node list (">=" tests rely on the order) */
+ NUMA_STRICT_ONLY, /* "=" prefix: node list with strict bind policy */
+ NUMA_STRICT_ONLY_AND_CPU_TOO, /* "@" prefix: strict bind policy plus pg_numa_bind() on the listed nodes */
+} NumaType;
+
+#define DEFAULT_NUMA NUMA_OFF
+
#ifndef WIN32
extern PGDLLIMPORT unsigned long UsedShmemSegID;
#else
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 799fa7ace684..854a7dd02b4c 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -94,6 +94,8 @@ extern bool check_multixact_member_buffers(int *newval, void **extra,
extern bool check_multixact_offset_buffers(int *newval, void **extra,
GucSource source);
extern bool check_notify_buffers(int *newval, void **extra, GucSource source);
+extern bool check_numa(char **newval, void **extra, GucSource source);
+extern void assign_numa(const char *newval, void *extra);
extern bool check_primary_slot_name(char **newval, void **extra,
GucSource source);
extern bool check_random_seed(double *newval, void **extra, GucSource source);
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 3368a43a3382..d461c15365fa 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -13,11 +13,18 @@
*-------------------------------------------------------------------------
*/
-#include "c.h"
+//JW:is this legal to replace "c.h" with below:
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
#include
#include "miscadmin.h"
#include "port/pg_numa.h"
+#include "common/string.h"
/*
* At this point we provide support only for Linux thanks to libnuma, but in
@@ -106,6 +113,87 @@ pg_numa_get_max_node(void)
return numa_max_node();
}
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask)
+{
+ numa_interleave_memory(ptr, sz, mask); /* pass the sz-byte region at ptr and the node mask straight to libnuma */
+ return 0; /* no failure is detected here, so the result is always 0 */
+}
+
+pg_numa_bitmask_t *
+pg_numa_parse_nodestring(const char *string)
+{
+ return numa_parse_nodestring(string); /* libnuma's result is returned unchanged; check_numa treats a null result as a syntax error */
+}
+
+void
+pg_numa_set_bind_policy(int strict)
+{
+ numa_set_bind_policy(strict); /* pass-through; the caller in this patch passes 1 for the '=' and '@' modes, 0 otherwise */
+}
+
+void
+pg_numa_bind(pg_numa_bitmask_t *nodemask)
+{
+ numa_bind(nodemask); /* thin pass-through to libnuma's numa_bind() */
+}
+
+#ifndef FRONTEND
+/*
+ * Replacement for libnuma's numa_warn(): report the formatted message as a
+ * WARNING via ereport(). The standard libnuma built-in code can be seen here:
+ * https://github.com/numactl/numactl/blob/master/libnuma.c
+ */
+void
+numa_warn(int num, char *fmt,...)
+{
+	int			olde = errno;
+	StringInfoData msg;
+
+	initStringInfo(&msg);
+
+	/* appendStringInfoVA returns 0 once the text fits; otherwise grow and retry */
+	for (;;)
+	{
+		va_list		ap;
+		int			needed;
+
+		va_start(ap, fmt);
+		needed = appendStringInfoVA(&msg, fmt, ap);
+		va_end(ap);
+		if (needed == 0)
+			break;
+		enlargeStringInfo(&msg, needed);
+	}
+
+	/* chomp last newline character */
+	pg_strip_crlf(msg.data);
+
+	ereport(WARNING,
+			(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+			 errmsg_internal("libnuma: %s", msg.data)));
+
+	pfree(msg.data);
+	errno = olde;				/* leave errno as the caller had it */
+}
+
+void
+numa_error(char *where)
+{
+	int			saved_errno = errno;
+
+	/* strip a trailing newline / carriage return in place */
+	pg_strip_crlf(where);
+
+	/*
+	 * XXX: only a WARNING is raised for now; in the long run the level
+	 * might depend on numa_set_strict().
+	 */
+	ereport(WARNING, (errmsg_internal("libnuma: %s", where)));
+	errno = saved_errno;
+}
+#endif /* FRONTEND */
+
#else
/* Empty wrappers */
@@ -128,4 +216,28 @@ pg_numa_get_max_node(void)
return 0;
}
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask)
+{
+ return 0; /* build without libnuma: nothing is interleaved, 0 is returned like the real wrapper */
+}
+
+pg_numa_bitmask_t *
+pg_numa_parse_nodestring(const char *string)
+{
+ return NULL; /* build without libnuma: no node mask can be produced */
+}
+
+void
+pg_numa_set_bind_policy(int strict)
+{
+ return; /* build without libnuma: nothing to do */
+}
+
+void
+pg_numa_bind(pg_numa_bitmask_t *nodemask)
+{
+ return; /* build without libnuma: nothing to do */
+}
+
#endif