diff options
| author | Kevin Grittner | 2011-09-24 16:15:45 +0000 |
|---|---|---|
| committer | Kevin Grittner | 2011-09-24 16:15:45 +0000 |
| commit | af8d5448f8be9c3f5fb030ac94509629cccab09b (patch) | |
| tree | 57533e96b2317c49aaa418632ec49046ce0c93a5 /src/backend | |
| parent | bb08357723c3188d73f3eca170987d4d7af58635 (diff) | |
| parent | 337c0b03614c45516f2c3ec956405713bb264d54 (diff) | |
Merge branch 'master' into serializableserializable
Diffstat (limited to 'src/backend')
22 files changed, 368 insertions, 38 deletions
diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c index 040bef6add..fcc90fed5f 100644 --- a/src/backend/catalog/namespace.c +++ b/src/backend/catalog/namespace.c @@ -225,7 +225,6 @@ RangeVarGetRelid(const RangeVar *relation, LOCKMODE lockmode, bool missing_ok, bool nowait) { uint64 inval_count; - Oid namespaceId; Oid relId; Oid oldRelId = InvalidOid; bool retry = false; @@ -278,17 +277,27 @@ RangeVarGetRelid(const RangeVar *relation, LOCKMODE lockmode, bool missing_ok, */ if (relation->relpersistence == RELPERSISTENCE_TEMP) { - if (relation->schemaname) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TABLE_DEFINITION), - errmsg("temporary tables cannot specify a schema name"))); - if (OidIsValid(myTempNamespace)) + if (!OidIsValid(myTempNamespace)) + relId = InvalidOid; /* this probably can't happen? */ + else + { + if (relation->schemaname) + { + Oid namespaceId; + namespaceId = LookupExplicitNamespace(relation->schemaname); + if (namespaceId != myTempNamespace) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TABLE_DEFINITION), + errmsg("temporary tables cannot specify a schema name"))); + } + relId = get_relname_relid(relation->relname, myTempNamespace); - else /* this probably can't happen? 
*/ - relId = InvalidOid; + } } else if (relation->schemaname) { + Oid namespaceId; + /* use exact schema given */ namespaceId = LookupExplicitNamespace(relation->schemaname); relId = get_relname_relid(relation->relname, namespaceId); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 6408d1653b..cd9fc92923 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -18,7 +18,6 @@ #include "commands/defrem.h" #include "commands/prepare.h" #include "executor/hashjoin.h" -#include "executor/instrument.h" #include "foreign/fdwapi.h" #include "optimizer/clauses.h" #include "parser/parsetree.h" @@ -76,6 +75,8 @@ static void show_sort_keys_common(PlanState *planstate, List *ancestors, ExplainState *es); static void show_sort_info(SortState *sortstate, ExplainState *es); static void show_hash_info(HashState *hashstate, ExplainState *es); +static void show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es); static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void ExplainScanTarget(Scan *plan, ExplainState *es); @@ -1000,9 +1001,15 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_IndexScan: show_scan_qual(((IndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); + if (((IndexScan *) plan)->indexqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); show_scan_qual(((IndexScan *) plan)->indexorderbyorig, "Order By", planstate, ancestors, es); show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); break; case T_BitmapIndexScan: show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, @@ -1011,6 +1018,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_BitmapHeapScan: show_scan_qual(((BitmapHeapScan *) 
plan)->bitmapqualorig, "Recheck Cond", planstate, ancestors, es); + if (((BitmapHeapScan *) plan)->bitmapqualorig) + show_instrumentation_count("Rows Removed by Index Recheck", 2, + planstate, es); /* FALL THRU */ case T_SeqScan: case T_ValuesScan: @@ -1018,6 +1028,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_WorkTableScan: case T_SubqueryScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); break; case T_FunctionScan: if (es->verbose) @@ -1025,6 +1038,9 @@ ExplainNode(PlanState *planstate, List *ancestors, "Function Call", planstate, ancestors, es->verbose, es); show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); break; case T_TidScan: { @@ -1038,34 +1054,61 @@ ExplainNode(PlanState *planstate, List *ancestors, tidquals = list_make1(make_orclause(tidquals)); show_scan_qual(tidquals, "TID Cond", planstate, ancestors, es); show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); } break; case T_ForeignScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); show_foreignscan_info((ForeignScanState *) planstate, es); break; case T_NestLoop: show_upper_qual(((NestLoop *) plan)->join.joinqual, "Join Filter", planstate, ancestors, es); + if (((NestLoop *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); break; case T_MergeJoin: show_upper_qual(((MergeJoin *) plan)->mergeclauses, "Merge Cond", planstate, ancestors, es); 
show_upper_qual(((MergeJoin *) plan)->join.joinqual, "Join Filter", planstate, ancestors, es); + if (((MergeJoin *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); break; case T_HashJoin: show_upper_qual(((HashJoin *) plan)->hashclauses, "Hash Cond", planstate, ancestors, es); show_upper_qual(((HashJoin *) plan)->join.joinqual, "Join Filter", planstate, ancestors, es); + if (((HashJoin *) plan)->join.joinqual) + show_instrumentation_count("Rows Removed by Join Filter", 1, + planstate, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 2, + planstate, es); break; case T_Agg: case T_Group: show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); break; case T_Sort: show_sort_keys((SortState *) planstate, ancestors, es); @@ -1079,6 +1122,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_upper_qual((List *) ((Result *) plan)->resconstantqual, "One-Time Filter", planstate, ancestors, es); show_upper_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); break; case T_Hash: show_hash_info((HashState *) planstate, es); @@ -1509,6 +1555,37 @@ show_hash_info(HashState *hashstate, ExplainState *es) } /* + * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node + * + * "which" identifies which instrumentation counter to print + */ +static void +show_instrumentation_count(const char *qlabel, int which, + PlanState *planstate, ExplainState *es) +{ + double nfiltered; + double nloops; + + if (!es->analyze || !planstate->instrument) + return; + + if (which 
== 2) + nfiltered = planstate->instrument->nfiltered2; + else + nfiltered = planstate->instrument->nfiltered1; + nloops = planstate->instrument->nloops; + + /* In text mode, suppress zero counts; they're not interesting enough */ + if (nfiltered > 0 || es->format != EXPLAIN_FORMAT_TEXT) + { + if (nloops > 0) + ExplainPropertyFloat(qlabel, nfiltered / nloops, 0, es); + else + ExplainPropertyFloat(qlabel, 0.0, 0, es); + } +} + +/* * Show extra information for a ForeignScan node. */ static void diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 680962aa44..06d368e077 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -29,7 +29,6 @@ #include "commands/defrem.h" #include "commands/trigger.h" #include "executor/executor.h" -#include "executor/instrument.h" #include "miscadmin.h" #include "nodes/bitmapset.h" #include "nodes/makefuncs.h" diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index ffdcc966ee..711e8c7786 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -13,7 +13,6 @@ #include "postgres.h" #include "executor/execdebug.h" -#include "executor/instrument.h" #include "executor/nodeAgg.h" #include "executor/nodeAppend.h" #include "executor/nodeBitmapAnd.h" diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 1dfe8b9ac7..fd7a9ed033 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -47,7 +47,6 @@ #include "commands/tablespace.h" #include "commands/trigger.h" #include "executor/execdebug.h" -#include "executor/instrument.h" #include "miscadmin.h" #include "optimizer/clauses.h" #include "parser/parse_clause.h" diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 17788761d7..8bdfad2222 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -77,7 +77,6 @@ #include "postgres.h" #include 
"executor/executor.h" -#include "executor/instrument.h" #include "executor/nodeAgg.h" #include "executor/nodeAppend.h" #include "executor/nodeBitmapAnd.h" diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index e90058847d..d4ed235856 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -219,6 +219,8 @@ ExecScan(ScanState *node, return slot; } } + else + InstrCountFiltered1(node, 1); /* * Tuple fails qual, so free per-tuple memory and try again. diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index bf9bf12ab6..9d30200ab3 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -22,6 +22,7 @@ BufferUsage pgBufferUsage; static void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub); + /* Allocate new instrumentation structure(s) */ Instrumentation * InstrAlloc(int n, int instrument_options) @@ -31,13 +32,14 @@ InstrAlloc(int n, int instrument_options) /* timer is always required for now */ Assert(instrument_options & INSTRUMENT_TIMER); + /* initialize all fields to zeroes, then modify as needed */ instr = palloc0(n * sizeof(Instrumentation)); if (instrument_options & INSTRUMENT_BUFFERS) { int i; for (i = 0; i < n; i++) - instr[i].needs_bufusage = true; + instr[i].need_bufusage = true; } return instr; @@ -52,8 +54,8 @@ InstrStartNode(Instrumentation *instr) else elog(DEBUG2, "InstrStartNode called twice in a row"); - /* initialize buffer usage per plan node */ - if (instr->needs_bufusage) + /* save buffer usage totals at node entry, if needed */ + if (instr->need_bufusage) instr->bufusage_start = pgBufferUsage; } @@ -77,8 +79,8 @@ InstrStopNode(Instrumentation *instr, double nTuples) INSTR_TIME_SET_ZERO(instr->starttime); - /* Adds delta of buffer usage to node's count. 
*/ - if (instr->needs_bufusage) + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) BufferUsageAccumDiff(&instr->bufusage, &pgBufferUsage, &instr->bufusage_start); @@ -119,12 +121,12 @@ InstrEndLoop(Instrumentation *instr) instr->tuplecount = 0; } +/* dst += add - sub */ static void BufferUsageAccumDiff(BufferUsage *dst, const BufferUsage *add, const BufferUsage *sub) { - /* dst += add - sub */ dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit; dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read; dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written; diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 13d7723480..e769d6d012 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -1204,6 +1204,8 @@ agg_retrieve_direct(AggState *aggstate) return result; } } + else + InstrCountFiltered1(aggstate, 1); } /* No more groups */ @@ -1354,6 +1356,8 @@ agg_retrieve_hash_table(AggState *aggstate) return result; } } + else + InstrCountFiltered1(aggstate, 1); } /* No more groups */ diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c index 82308cba26..5f318c31e7 100644 --- a/src/backend/executor/nodeBitmapAnd.c +++ b/src/backend/executor/nodeBitmapAnd.c @@ -29,7 +29,6 @@ #include "postgres.h" #include "executor/execdebug.h" -#include "executor/instrument.h" #include "executor/nodeBitmapAnd.h" diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 8e50fb1aae..4a8920e6ce 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -278,6 +278,7 @@ BitmapHeapNext(BitmapHeapScanState *node) if (!ExecQual(node->bitmapqualorig, econtext, false)) { /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); ExecClearTuple(slot); continue; } diff --git 
a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 9a56fd4b9f..8e1df079b3 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -22,7 +22,6 @@ #include "postgres.h" #include "executor/execdebug.h" -#include "executor/instrument.h" #include "executor/nodeBitmapIndexscan.h" #include "executor/nodeIndexscan.h" #include "miscadmin.h" diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c index 4b064b79a9..d2453d5a4f 100644 --- a/src/backend/executor/nodeBitmapOr.c +++ b/src/backend/executor/nodeBitmapOr.c @@ -29,7 +29,6 @@ #include "postgres.h" #include "executor/execdebug.h" -#include "executor/instrument.h" #include "executor/nodeBitmapOr.h" #include "miscadmin.h" diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c index fa403e5406..7bef8bbe8b 100644 --- a/src/backend/executor/nodeGroup.c +++ b/src/backend/executor/nodeGroup.c @@ -118,6 +118,8 @@ ExecGroup(GroupState *node) return result; } } + else + InstrCountFiltered1(node, 1); } /* @@ -179,6 +181,8 @@ ExecGroup(GroupState *node) return result; } } + else + InstrCountFiltered1(node, 1); } /* NOTREACHED */ diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 2ade2d7fad..e72a71bf51 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -28,7 +28,6 @@ #include "commands/tablespace.h" #include "executor/execdebug.h" #include "executor/hashjoin.h" -#include "executor/instrument.h" #include "executor/nodeHash.h" #include "executor/nodeHashjoin.h" #include "miscadmin.h" diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 3a6698105f..c3c4db4bc2 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -325,7 +325,11 @@ ExecHashJoin(HashJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); } + else + 
InstrCountFiltered1(node, 1); break; case HJ_FILL_OUTER_TUPLE: @@ -360,6 +364,8 @@ ExecHashJoin(HashJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); } break; @@ -397,6 +403,8 @@ ExecHashJoin(HashJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); break; case HJ_NEED_NEW_BATCH: diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 955008e012..da25384e86 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -96,7 +96,11 @@ IndexNext(IndexScanState *node) econtext->ecxt_scantuple = slot; ResetExprContext(econtext); if (!ExecQual(node->indexqualorig, econtext, false)) - continue; /* nope, so ask index for another one */ + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + continue; + } } return slot; diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c index e23dd6c9f5..deaa79ed9f 100644 --- a/src/backend/executor/nodeMergejoin.c +++ b/src/backend/executor/nodeMergejoin.c @@ -505,6 +505,8 @@ MJFillOuter(MergeJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); return NULL; } @@ -544,6 +546,8 @@ MJFillInner(MergeJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); return NULL; } @@ -893,7 +897,11 @@ ExecMergeJoin(MergeJoinState *node) return result; } } + else + InstrCountFiltered2(node, 1); } + else + InstrCountFiltered1(node, 1); break; /* diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c index e98bc0f5a3..49b880d0ca 100644 --- a/src/backend/executor/nodeNestloop.c +++ b/src/backend/executor/nodeNestloop.c @@ -214,6 +214,8 @@ ExecNestLoop(NestLoopState *node) return result; } } + else + InstrCountFiltered2(node, 1); } /* @@ -270,7 +272,11 @@ ExecNestLoop(NestLoopState *node) return result; } } + else + InstrCountFiltered2(node, 1); } + else + InstrCountFiltered1(node, 1); /* * 
Tuple fails qual, so free per-tuple memory and try again. diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier new file mode 100644 index 0000000000..f9f3593b77 --- /dev/null +++ b/src/backend/storage/lmgr/README.barrier @@ -0,0 +1,199 @@ +Memory Barriers +=============== + +Modern CPUs make extensive use of pipe-lining and out-of-order execution, +meaning that the CPU is often executing more than one instruction at a +time, and not necessarily in the order that the source code would suggest. +Furthermore, even before the CPU gets a chance to reorder operations, the +compiler may (and often does) reorganize the code for greater efficiency, +particularly at higher optimization levels. Optimizing compilers and +out-of-order execution are both critical for good performance, but they +can lead to surprising results when multiple processes access the same +memory space. + +Example +======= + +Suppose x is a pointer to a structure stored in shared memory, and that the +entire structure has been initialized to zero bytes. One backend executes +the following code fragment: + + x->foo = 1; + x->bar = 1; + +Meanwhile, at approximately the same time, another backend executes this +code fragment: + + bar = x->bar; + foo = x->foo; + +The second backend might end up with foo = 1 and bar = 1 (if it executes +both statements after the first backend), or with foo = 0 and bar = 0 (if +it executes both statements before the first backend), or with foo = 1 and +bar = 0 (if the first backend executes the first statement, the second +backend executes both statements, and then the first backend executes the +second statement). + +Surprisingly, however, the second backend could also end up with foo = 0 +and bar = 1. The compiler might swap the order of the two stores performed +by the first backend, or the two loads performed by the second backend. 
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC +or Itanium) the CPU might choose to execute either the loads or the stores +out of order. This surprising result can lead to bugs. + +A common pattern where this actually does result in a bug is when adding items +onto a queue. The writer does this: + + q->items[q->num_items] = new_item; + ++q->num_items; + +The reader does this: + + num_items = q->num_items; + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +This code turns out to be unsafe, because the writer might increment +q->num_items before it finishes storing the new item into the appropriate slot. +More subtly, the reader might prefetch the contents of the q->items array +before reading q->num_items. Thus, there's still a bug here *even if the +writer does everything in the order we expect*. We need the writer to update +the array before bumping the item counter, and the reader to examine the item +counter before examining the array. + +Note that these types of highly counterintuitive bugs can *only* occur when +multiple processes are interacting with the same memory segment. A given +process always perceives its *own* writes to memory in program order. + +Avoiding Memory Ordering Bugs +============================= + +The simplest (and often best) way to avoid memory ordering bugs is to +protect the data structures involved with an lwlock. For more details, see +src/backend/storage/lmgr/README. For instance, in the above example, the +writer could acquire an lwlock in exclusive mode before appending to the +queue, and each reader could acquire the same lock in shared mode before +reading it. If the data structure is not heavily trafficked, this solution is +generally entirely adequate. + +However, in some cases, it is desirable to avoid the overhead of acquiring +and releasing locks. In this case, memory barriers may be used to ensure +that the apparent order of execution is as the programmer desires. 
In +PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve +this result. In the example above, we can prevent the reader from seeing a +garbage value by having the writer do this: + + q->items[q->num_items] = new_item; + pg_memory_barrier(); + ++q->num_items; + +And by having the reader do this: + + num_items = q->num_items; + pg_memory_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +The pg_memory_barrier() macro will (1) prevent the compiler from rearranging +the code in such a way as to allow the memory accesses to occur out of order +and (2) generate any code (often, inline assembly) that is needed to prevent +the CPU from executing the memory accesses out of order. Specifically, the +barrier prevents loads and stores written after the barrier from being +performed before the barrier, and vice-versa. + +Although this code will work, it is needlessly inefficient. On systems with +strong memory ordering (such as x86), the CPU never reorders loads with other +loads, nor stores with other stores. It can, however, allow a load to be +performed before a subsequent store. To avoid emitting unnecessary memory +instructions, we provide two additional primitives: pg_read_barrier(), and +pg_write_barrier(). When a memory barrier is being used to separate two +loads, use pg_read_barrier(); when it is separating two stores, use +pg_write_barrier(); when it is separating a load and a store (in either +order), use pg_memory_barrier(). pg_memory_barrier() can always substitute +for either a read or a write barrier, but is typically more expensive, and +therefore should be used only when needed. 
+ +With these guidelines in mind, the writer can do this: + + q->items[q->num_items] = new_item; + pg_write_barrier(); + ++q->num_items; + +And the reader can do this: + + num_items = q->num_items; + pg_read_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +On machines with strong memory ordering, these weaker barriers will simply +prevent compiler rearrangement, without emitting any actual machine code. +On machines with weak memory ordering, they will prevent compiler +reordering and also emit whatever hardware barrier may be required. Even +on machines with weak memory ordering, a read or write barrier may be able +to use a less expensive instruction than a full barrier. + +Weaknesses of Memory Barriers +============================= + +While memory barriers are a powerful tool, and much cheaper than locks, they +are also much less capable than locks. Here are some of the problems. + +1. Concurrent writers are unsafe. In the above example of a queue, using +memory barriers doesn't make it safe for two processes to add items to the +same queue at the same time. If more than one process can write to the queue, +a spinlock or lwlock must be used to synchronize access. The readers can +perhaps proceed without any lock, but the writers may not. + +Even very simple write operations often require additional synchronization. +For example, it's not safe for multiple writers to simultaneously execute +this code (supposing x is a pointer into shared memory): + + x->foo++; + +Although this may compile down to a single machine-language instruction, +the CPU will execute that instruction by reading the current value of foo, +adding one to it, and then storing the result back to the original address. +If two CPUs try to do this simultaneously, both may do their reads before +either one does their writes. 
Eventually we might be able to use an atomic +fetch-and-add instruction for this specific case on architectures that support +it, but we can't rely on that being available everywhere, and we currently +have no support for it at all. Use a lock. + +2. Eight-byte loads and stores aren't necessarily atomic. We assume in +various places in the source code that an aligned four-byte load or store is +atomic, and that other processes therefore won't see a half-set value. +Sadly, the same can't be said for eight-byte values: on some platforms, an +aligned eight-byte load or store will generate two four-byte operations. If +you need an atomic eight-byte read or write, you must make it atomic with a +lock. + +3. No ordering guarantees. While memory barriers ensure that any given +process performs loads and stores to shared memory in order, they don't +guarantee synchronization. In the queue example above, we can use memory +barriers to be sure that readers won't see garbage, but there's nothing to +say whether a given reader will run before or after a given writer. If this +matters in a given situation, some other mechanism must be used instead of +or in addition to memory barriers. + +4. Barrier proliferation. Many algorithms that at first seem appealing +require multiple barriers. If the number of barriers required is more than +one or two, you may be better off just using a lock. Keep in mind that, on +some platforms, a barrier may be implemented by acquiring and releasing a +backend-private spinlock. This may be better than a centralized lock under +contention, but it may also be slower in the uncontended case. + +Further Reading +=============== + +Much of the documentation about memory barriers appears to be quite +Linux-specific. The following papers may be helpful: + +Memory Ordering in Modern Microprocessors, by Paul E. 
McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf + +Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf + +The Linux kernel also has some useful documentation on this topic. Start +with Documentation/memory-barriers.txt diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c index 1aa9912572..cd1306c182 100644 --- a/src/backend/storage/lmgr/s_lock.c +++ b/src/backend/storage/lmgr/s_lock.c @@ -20,6 +20,7 @@ #include "storage/s_lock.h" +slock_t dummy_spinlock; static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 7112dea0e1..fe5e14b9dc 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -707,8 +707,7 @@ cache_locale_time(void) * otherwise returns the pointer to a static area which * contains the iso formatted locale name. */ -static -char * +static char * IsoLocaleName(const char *winlocname) { #if (_MSC_VER >= 1400) /* VC8.0 or later */ @@ -937,6 +936,29 @@ lc_ctype_is_c(Oid collation) } +/* simple subroutine for reporting errors from newlocale() */ +#ifdef HAVE_LOCALE_T +static void +report_newlocale_failure(const char *localename) +{ + /* copy errno in case one of the ereport auxiliary functions changes it */ + int save_errno = errno; + + /* + * ENOENT means "no such locale", not "no such file", so clarify that + * errno with an errdetail message. + */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not create locale \"%s\": %m", + localename), + (save_errno == ENOENT ? + errdetail("The operating system could not find any locale data for the locale name \"%s\".", + localename) : 0))); +} +#endif /* HAVE_LOCALE_T */ + + /* * Create a locale_t from a collation OID. Results are cached for the * lifetime of the backend. 
Thus, do not free the result with freelocale(). @@ -995,10 +1017,7 @@ pg_newlocale_from_collation(Oid collid) result = _create_locale(LC_ALL, collcollate); #endif if (!result) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create locale \"%s\": %m", - collcollate))); + report_newlocale_failure(collcollate); } else { @@ -1008,16 +1027,10 @@ pg_newlocale_from_collation(Oid collid) loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL); if (!loc1) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create locale \"%s\": %m", - collcollate))); + report_newlocale_failure(collcollate); result = newlocale(LC_CTYPE_MASK, collctype, loc1); if (!result) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create locale \"%s\": %m", - collctype))); + report_newlocale_failure(collctype); #else /* |
