Merge branch 'master' into serializableserializable

author: Kevin Grittner 2011-09-24 16:15:45 +0000
committer: Kevin Grittner 2011-09-24 16:15:45 +0000
commit: af8d5448f8be9c3f5fb030ac94509629cccab09b (patch)
tree: 57533e96b2317c49aaa418632ec49046ce0c93a5 /src/backend
parent: bb08357723c3188d73f3eca170987d4d7af58635 (diff)
parent: 337c0b03614c45516f2c3ec956405713bb264d54 (diff)
22 files changed, 368 insertions, 38 deletions
diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index 040bef6add..fcc90fed5f 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -225,7 +225,6 @@ RangeVarGetRelid(const RangeVar *relation, LOCKMODE lockmode, bool missing_ok,
 				 bool nowait)
 {
 	uint64		inval_count;
-	Oid			namespaceId;
 	Oid			relId;
 	Oid			oldRelId = InvalidOid;
 	bool		retry = false;
@@ -278,17 +277,27 @@ RangeVarGetRelid(const RangeVar *relation, LOCKMODE lockmode, bool missing_ok,
 		 */
 		if (relation->relpersistence == RELPERSISTENCE_TEMP)
 		{
-			if (relation->schemaname)
-				ereport(ERROR,
-						(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
-					   errmsg("temporary tables cannot specify a schema name")));
-			if (OidIsValid(myTempNamespace))
+			if (!OidIsValid(myTempNamespace))
+				relId = InvalidOid;	/* this probably can't happen? */
+			else
+			{
+				if (relation->schemaname)
+				{
+					Oid		namespaceId;
+					namespaceId = LookupExplicitNamespace(relation->schemaname);
+					if (namespaceId != myTempNamespace)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+							   errmsg("temporary tables cannot specify a schema name")));
+				}
+
 				relId = get_relname_relid(relation->relname, myTempNamespace);
-			else	/* this probably can't happen? */
-				relId = InvalidOid;
+			}
 		}
 		else if (relation->schemaname)
 		{
+			Oid			namespaceId;
+
 			/* use exact schema given */
 			namespaceId = LookupExplicitNamespace(relation->schemaname);
 			relId = get_relname_relid(relation->relname, namespaceId);
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 6408d1653b..cd9fc92923 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -18,7 +18,6 @@
 #include "commands/defrem.h"
 #include "commands/prepare.h"
 #include "executor/hashjoin.h"
-#include "executor/instrument.h"
 #include "foreign/fdwapi.h"
 #include "optimizer/clauses.h"
 #include "parser/parsetree.h"
@@ -76,6 +75,8 @@ static void show_sort_keys_common(PlanState *planstate,
 					  List *ancestors, ExplainState *es);
 static void show_sort_info(SortState *sortstate, ExplainState *es);
 static void show_hash_info(HashState *hashstate, ExplainState *es);
+static void show_instrumentation_count(const char *qlabel, int which,
+						   PlanState *planstate, ExplainState *es);
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
 static const char *explain_get_index_name(Oid indexId);
 static void ExplainScanTarget(Scan *plan, ExplainState *es);
@@ -1000,9 +1001,15 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_IndexScan:
 			show_scan_qual(((IndexScan *) plan)->indexqualorig,
 						   "Index Cond", planstate, ancestors, es);
+			if (((IndexScan *) plan)->indexqualorig)
+				show_instrumentation_count("Rows Removed by Index Recheck", 2,
+										   planstate, es);
 			show_scan_qual(((IndexScan *) plan)->indexorderbyorig,
 						   "Order By", planstate, ancestors, es);
 			show_scan_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			break;
 		case T_BitmapIndexScan:
 			show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
@@ -1011,6 +1018,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_BitmapHeapScan:
 			show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
 						   "Recheck Cond", planstate, ancestors, es);
+			if (((BitmapHeapScan *) plan)->bitmapqualorig)
+				show_instrumentation_count("Rows Removed by Index Recheck", 2,
+										   planstate, es);
 			/* FALL THRU */
 		case T_SeqScan:
 		case T_ValuesScan:
@@ -1018,6 +1028,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_WorkTableScan:
 		case T_SubqueryScan:
 			show_scan_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			break;
 		case T_FunctionScan:
 			if (es->verbose)
@@ -1025,6 +1038,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 								"Function Call", planstate, ancestors,
 								es->verbose, es);
 			show_scan_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			break;
 		case T_TidScan:
 			{
@@ -1038,34 +1054,61 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					tidquals = list_make1(make_orclause(tidquals));
 				show_scan_qual(tidquals, "TID Cond", planstate, ancestors, es);
 				show_scan_qual(plan->qual, "Filter", planstate, ancestors, es);
+				if (plan->qual)
+					show_instrumentation_count("Rows Removed by Filter", 1,
+											   planstate, es);
 			}
 			break;
 		case T_ForeignScan:
 			show_scan_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			show_foreignscan_info((ForeignScanState *) planstate, es);
 			break;
 		case T_NestLoop:
 			show_upper_qual(((NestLoop *) plan)->join.joinqual,
 							"Join Filter", planstate, ancestors, es);
+			if (((NestLoop *) plan)->join.joinqual)
+				show_instrumentation_count("Rows Removed by Join Filter", 1,
+										   planstate, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 2,
+										   planstate, es);
 			break;
 		case T_MergeJoin:
 			show_upper_qual(((MergeJoin *) plan)->mergeclauses,
 							"Merge Cond", planstate, ancestors, es);
 			show_upper_qual(((MergeJoin *) plan)->join.joinqual,
 							"Join Filter", planstate, ancestors, es);
+			if (((MergeJoin *) plan)->join.joinqual)
+				show_instrumentation_count("Rows Removed by Join Filter", 1,
+										   planstate, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 2,
+										   planstate, es);
 			break;
 		case T_HashJoin:
 			show_upper_qual(((HashJoin *) plan)->hashclauses,
 							"Hash Cond", planstate, ancestors, es);
 			show_upper_qual(((HashJoin *) plan)->join.joinqual,
 							"Join Filter", planstate, ancestors, es);
+			if (((HashJoin *) plan)->join.joinqual)
+				show_instrumentation_count("Rows Removed by Join Filter", 1,
+										   planstate, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 2,
+										   planstate, es);
 			break;
 		case T_Agg:
 		case T_Group:
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			break;
 		case T_Sort:
 			show_sort_keys((SortState *) planstate, ancestors, es);
@@ -1079,6 +1122,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
 			show_upper_qual((List *) ((Result *) plan)->resconstantqual,
 							"One-Time Filter", planstate, ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
+			if (plan->qual)
+				show_instrumentation_count("Rows Removed by Filter", 1,
+										   planstate, es);
 			break;
 		case T_Hash:
 			show_hash_info((HashState *) planstate, es);
@@ -1509,6 +1555,37 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 }
 
 /*
+ * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node
+ *
+ * "which" identifies which instrumentation counter to print
+ */
+static void
+show_instrumentation_count(const char *qlabel, int which,
+						   PlanState *planstate, ExplainState *es)
+{
+	double		nfiltered;
+	double		nloops;
+
+	if (!es->analyze || !planstate->instrument)
+		return;
+
+	if (which == 2)
+		nfiltered = planstate->instrument->nfiltered2;
+	else
+		nfiltered = planstate->instrument->nfiltered1;
+	nloops = planstate->instrument->nloops;
+
+	/* In text mode, suppress zero counts; they're not interesting enough */
+	if (nfiltered > 0 || es->format != EXPLAIN_FORMAT_TEXT)
+	{
+		if (nloops > 0)
+			ExplainPropertyFloat(qlabel, nfiltered / nloops, 0, es);
+		else
+			ExplainPropertyFloat(qlabel, 0.0, 0, es);
+	}
+}
+
+/*
  * Show extra information for a ForeignScan node.
  */
 static void
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 680962aa44..06d368e077 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -29,7 +29,6 @@
 #include "commands/defrem.h"
 #include "commands/trigger.h"
 #include "executor/executor.h"
-#include "executor/instrument.h"
 #include "miscadmin.h"
 #include "nodes/bitmapset.h"
 #include "nodes/makefuncs.h"
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index ffdcc966ee..711e8c7786 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -13,7 +13,6 @@
 #include "postgres.h"
 
 #include "executor/execdebug.h"
-#include "executor/instrument.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
 #include "executor/nodeBitmapAnd.h"
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 1dfe8b9ac7..fd7a9ed033 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -47,7 +47,6 @@
 #include "commands/tablespace.h"
 #include "commands/trigger.h"
 #include "executor/execdebug.h"
-#include "executor/instrument.h"
 #include "miscadmin.h"
 #include "optimizer/clauses.h"
 #include "parser/parse_clause.h"
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 17788761d7..8bdfad2222 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -77,7 +77,6 @@
 #include "postgres.h"
 
 #include "executor/executor.h"
-#include "executor/instrument.h"
 #include "executor/nodeAgg.h"
 #include "executor/nodeAppend.h"
 #include "executor/nodeBitmapAnd.h"
diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c
index e90058847d..d4ed235856 100644
--- a/src/backend/executor/execScan.c
+++ b/src/backend/executor/execScan.c
@@ -219,6 +219,8 @@ ExecScan(ScanState *node,
 				return slot;
 			}
 		}
+		else
+			InstrCountFiltered1(node, 1);
 
 		/*
 		 * Tuple fails qual, so free per-tuple memory and try again.
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index bf9bf12ab6..9d30200ab3 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -22,6 +22,7 @@ BufferUsage pgBufferUsage;
 static void BufferUsageAccumDiff(BufferUsage *dst,
 					 const BufferUsage *add, const BufferUsage *sub);
 
+
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n, int instrument_options)
@@ -31,13 +32,14 @@ InstrAlloc(int n, int instrument_options)
 	/* timer is always required for now */
 	Assert(instrument_options & INSTRUMENT_TIMER);
 
+	/* initialize all fields to zeroes, then modify as needed */
 	instr = palloc0(n * sizeof(Instrumentation));
 	if (instrument_options & INSTRUMENT_BUFFERS)
 	{
 		int			i;
 
 		for (i = 0; i < n; i++)
-			instr[i].needs_bufusage = true;
+			instr[i].need_bufusage = true;
 	}
 
 	return instr;
@@ -52,8 +54,8 @@ InstrStartNode(Instrumentation *instr)
 	else
 		elog(DEBUG2, "InstrStartNode called twice in a row");
 
-	/* initialize buffer usage per plan node */
-	if (instr->needs_bufusage)
+	/* save buffer usage totals at node entry, if needed */
+	if (instr->need_bufusage)
 		instr->bufusage_start = pgBufferUsage;
 }
 
@@ -77,8 +79,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 
 	INSTR_TIME_SET_ZERO(instr->starttime);
 
-	/* Adds delta of buffer usage to node's count. */
-	if (instr->needs_bufusage)
+	/* Add delta of buffer usage since entry to node's totals */
+	if (instr->need_bufusage)
 		BufferUsageAccumDiff(&instr->bufusage,
 							 &pgBufferUsage, &instr->bufusage_start);
 
@@ -119,12 +121,12 @@ InstrEndLoop(Instrumentation *instr)
 	instr->tuplecount = 0;
 }
 
+/* dst += add - sub */
 static void
 BufferUsageAccumDiff(BufferUsage *dst,
 					 const BufferUsage *add,
 					 const BufferUsage *sub)
 {
-	/* dst += add - sub */
 	dst->shared_blks_hit += add->shared_blks_hit - sub->shared_blks_hit;
 	dst->shared_blks_read += add->shared_blks_read - sub->shared_blks_read;
 	dst->shared_blks_written += add->shared_blks_written - sub->shared_blks_written;
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 13d7723480..e769d6d012 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1204,6 +1204,8 @@ agg_retrieve_direct(AggState *aggstate)
 				return result;
 			}
 		}
+		else
+			InstrCountFiltered1(aggstate, 1);
 	}
 
 	/* No more groups */
@@ -1354,6 +1356,8 @@ agg_retrieve_hash_table(AggState *aggstate)
 				return result;
 			}
 		}
+		else
+			InstrCountFiltered1(aggstate, 1);
 	}
 
 	/* No more groups */
diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c
index 82308cba26..5f318c31e7 100644
--- a/src/backend/executor/nodeBitmapAnd.c
+++ b/src/backend/executor/nodeBitmapAnd.c
@@ -29,7 +29,6 @@
 #include "postgres.h"
 
 #include "executor/execdebug.h"
-#include "executor/instrument.h"
 #include "executor/nodeBitmapAnd.h"
 
 
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 8e50fb1aae..4a8920e6ce 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -278,6 +278,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			if (!ExecQual(node->bitmapqualorig, econtext, false))
 			{
 				/* Fails recheck, so drop it and loop back for another */
+				InstrCountFiltered2(node, 1);
 				ExecClearTuple(slot);
 				continue;
 			}
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 9a56fd4b9f..8e1df079b3 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -22,7 +22,6 @@
 #include "postgres.h"
 
 #include "executor/execdebug.h"
-#include "executor/instrument.h"
 #include "executor/nodeBitmapIndexscan.h"
 #include "executor/nodeIndexscan.h"
 #include "miscadmin.h"
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 4b064b79a9..d2453d5a4f 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -29,7 +29,6 @@
 #include "postgres.h"
 
 #include "executor/execdebug.h"
-#include "executor/instrument.h"
 #include "executor/nodeBitmapOr.h"
 #include "miscadmin.h"
 
diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c
index fa403e5406..7bef8bbe8b 100644
--- a/src/backend/executor/nodeGroup.c
+++ b/src/backend/executor/nodeGroup.c
@@ -118,6 +118,8 @@ ExecGroup(GroupState *node)
 				return result;
 			}
 		}
+		else
+			InstrCountFiltered1(node, 1);
 	}
 
 	/*
@@ -179,6 +181,8 @@ ExecGroup(GroupState *node)
 				return result;
 			}
 		}
+		else
+			InstrCountFiltered1(node, 1);
 	}
 
 	/* NOTREACHED */
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 2ade2d7fad..e72a71bf51 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -28,7 +28,6 @@
 #include "commands/tablespace.h"
 #include "executor/execdebug.h"
 #include "executor/hashjoin.h"
-#include "executor/instrument.h"
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "miscadmin.h"
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 3a6698105f..c3c4db4bc2 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -325,7 +325,11 @@ ExecHashJoin(HashJoinState *node)
 							return result;
 						}
 					}
+					else
+						InstrCountFiltered2(node, 1);
 				}
+				else
+					InstrCountFiltered1(node, 1);
 				break;
 
 			case HJ_FILL_OUTER_TUPLE:
@@ -360,6 +364,8 @@ ExecHashJoin(HashJoinState *node)
 							return result;
 						}
 					}
+					else
+						InstrCountFiltered2(node, 1);
 				}
 				break;
 
@@ -397,6 +403,8 @@ ExecHashJoin(HashJoinState *node)
 						return result;
 					}
 				}
+				else
+					InstrCountFiltered2(node, 1);
 				break;
 
 			case HJ_NEED_NEW_BATCH:
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 955008e012..da25384e86 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -96,7 +96,11 @@ IndexNext(IndexScanState *node)
 			econtext->ecxt_scantuple = slot;
 			ResetExprContext(econtext);
 			if (!ExecQual(node->indexqualorig, econtext, false))
-				continue;		/* nope, so ask index for another one */
+			{
+				/* Fails recheck, so drop it and loop back for another */
+				InstrCountFiltered2(node, 1);
+				continue;
+			}
 		}
 
 		return slot;
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c
index e23dd6c9f5..deaa79ed9f 100644
--- a/src/backend/executor/nodeMergejoin.c
+++ b/src/backend/executor/nodeMergejoin.c
@@ -505,6 +505,8 @@ MJFillOuter(MergeJoinState *node)
 			return result;
 		}
 	}
+	else
+		InstrCountFiltered2(node, 1);
 
 	return NULL;
 }
@@ -544,6 +546,8 @@ MJFillInner(MergeJoinState *node)
 			return result;
 		}
 	}
+	else
+		InstrCountFiltered2(node, 1);
 
 	return NULL;
 }
@@ -893,7 +897,11 @@ ExecMergeJoin(MergeJoinState *node)
 							return result;
 						}
 					}
+					else
+						InstrCountFiltered2(node, 1);
 				}
+				else
+					InstrCountFiltered1(node, 1);
 				break;
 
 				/*
diff --git a/src/backend/executor/nodeNestloop.c b/src/backend/executor/nodeNestloop.c
index e98bc0f5a3..49b880d0ca 100644
--- a/src/backend/executor/nodeNestloop.c
+++ b/src/backend/executor/nodeNestloop.c
@@ -214,6 +214,8 @@ ExecNestLoop(NestLoopState *node)
 						return result;
 					}
 				}
+				else
+					InstrCountFiltered2(node, 1);
 			}
 
 			/*
@@ -270,7 +272,11 @@ ExecNestLoop(NestLoopState *node)
 					return result;
 				}
 			}
+			else
+				InstrCountFiltered2(node, 1);
 		}
+		else
+			InstrCountFiltered1(node, 1);
 
 		/*
 		 * Tuple fails qual, so free per-tuple memory and try again.
diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier
new file mode 100644
index 0000000000..f9f3593b77
--- /dev/null
+++ b/src/backend/storage/lmgr/README.barrier
@@ -0,0 +1,199 @@
+Memory Barriers
+===============
+
+Modern CPUs make extensive use of pipe-lining and out-of-order execution,
+meaning that the CPU is often executing more than one instruction at a
+time, and not necessarily in the order that the source code would suggest.
+Furthermore, even before the CPU gets a chance to reorder operations, the
+compiler may (and often does) reorganize the code for greater efficiency,
+particularly at higher optimization levels.  Optimizing compilers and
+out-of-order execution are both critical for good performance, but they
+can lead to surprising results when multiple processes access the same
+memory space.
+
+Example
+=======
+
+Suppose x is a pointer to a structure stored in shared memory, and that the
+entire structure has been initialized to zero bytes.  One backend executes
+the following code fragment:
+
+    x->foo = 1;
+    x->bar = 1;
+
+Meanwhile, at approximately the same time, another backend executes this
+code fragment:
+
+    bar = x->bar;
+    foo = x->foo;
+
+The second backend might end up with foo = 1 and bar = 1 (if it executes
+both statements after the first backend), or with foo = 0 and bar = 0 (if
+it executes both statements before the first backend), or with foo = 1 and
+bar = 0 (if the first backend executes the first statement, the second
+backend executes both statements, and then the first backend executes the
+second statement).
+
+Surprisingly, however, the second backend could also end up with foo = 0
+and bar = 1.  The compiler might swap the order of the two stores performed
+by the first backend, or the two loads performed by the second backend.
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or Itanium) the CPU might choose to execute either the loads or the stores
+out of order.  This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue.  The writer does this:
+
+    q->items[q->num_items] = new_item;
+    ++q->num_items;
+
+The reader does this:
+
+    num_items = q->num_items;
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items.  Thus, there's still a bug here *even if the
+writer does everything in the order we expect*.  We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment.  A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock.  For more details, see
+src/backend/storage/lmgr/README.  For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it.  If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks.  In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires.   In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result.  In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+    q->items[q->num_items] = new_item;
+    pg_memory_barrier();
+    ++q->num_items;
+
+And by having the reader do this:
+
+    num_items = q->num_items;
+    pg_memory_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order.  Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient.  On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores.  It can, however, allow a load to be
+performed before a subsequent store.  To avoid emitting unnecessary memory
+instructions, we provide two additional primitives: pg_read_barrier(), and
+pg_write_barrier().  When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier().  pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+    q->items[q->num_items] = new_item;
+    pg_write_barrier();
+    ++q->num_items;
+
+And the reader can do this:
+
+    num_items = q->num_items;
+    pg_read_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required.  Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks.  Here are some of the problems.
+
+1. Concurrent writers are unsafe.  In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time.  If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+    x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does their writes.  Eventually we might be able to use an atomic
+fetch-and-add instruction for this specific case on architectures that support
+it, but we can't rely on that being available everywhere, and we currently
+have no support for it at all.  Use a lock.
+
+2. Eight-byte loads and stores aren't necessarily atomic.  We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for eight-byte values: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations.  If
+you need an atomic eight-byte read or write, you must make it atomic with a
+lock.
+
+3. No ordering guarantees.  While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization.  In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer.  If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation.  Many algorithms that at first seem appealing
+require multiple barriers.  If the number of barriers required is more than
+one or two, you may be better off just using a lock.  Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock.  This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
+
+Further Reading
+===============
+
+Much of the documentation about memory barriers appears to be quite
+Linux-specific.  The following papers may be helpful:
+
+Memory Ordering in Modern Microprocessors, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
+
+Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf
+
+The Linux kernel also has some useful documentation on this topic.  Start
+with Documentation/memory-barriers.txt
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
index 1aa9912572..cd1306c182 100644
--- a/src/backend/storage/lmgr/s_lock.c
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -20,6 +20,7 @@
 
 #include "storage/s_lock.h"
 
+slock_t  dummy_spinlock;
 
 static int	spins_per_delay = DEFAULT_SPINS_PER_DELAY;
 
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 7112dea0e1..fe5e14b9dc 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -707,8 +707,7 @@ cache_locale_time(void)
  *	otherwise returns the pointer to a static area which
  *	contains the iso formatted locale name.
  */
-static
-char *
+static char *
 IsoLocaleName(const char *winlocname)
 {
 #if (_MSC_VER >= 1400)			/* VC8.0 or later */
@@ -937,6 +936,29 @@ lc_ctype_is_c(Oid collation)
 }
 
 
+/* simple subroutine for reporting errors from newlocale() */
+#ifdef HAVE_LOCALE_T
+static void
+report_newlocale_failure(const char *localename)
+{
+	/* copy errno in case one of the ereport auxiliary functions changes it */
+	int			save_errno = errno;
+
+	/*
+	 * ENOENT means "no such locale", not "no such file", so clarify that
+	 * errno with an errdetail message.
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("could not create locale \"%s\": %m",
+					localename),
+			 (save_errno == ENOENT ?
+			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
+						localename) : 0)));
+}
+#endif /* HAVE_LOCALE_T */
+
+
 /*
  * Create a locale_t from a collation OID.	Results are cached for the
  * lifetime of the backend.  Thus, do not free the result with freelocale().
@@ -995,10 +1017,7 @@ pg_newlocale_from_collation(Oid collid)
 			result = _create_locale(LC_ALL, collcollate);
 #endif
 			if (!result)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not create locale \"%s\": %m",
-								collcollate)));
+				report_newlocale_failure(collcollate);
 		}
 		else
 		{
@@ -1008,16 +1027,10 @@ pg_newlocale_from_collation(Oid collid)
 
 			loc1 = newlocale(LC_COLLATE_MASK, collcollate, NULL);
 			if (!loc1)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not create locale \"%s\": %m",
-								collcollate)));
+				report_newlocale_failure(collcollate);
 			result = newlocale(LC_CTYPE_MASK, collctype, loc1);
 			if (!result)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not create locale \"%s\": %m",
-								collctype)));
+				report_newlocale_failure(collctype);
 #else
 
 			/*
author	Kevin Grittner	2011-09-24 16:15:45 +0000
committer	Kevin Grittner	2011-09-24 16:15:45 +0000
commit	af8d5448f8be9c3f5fb030ac94509629cccab09b (patch)
tree	57533e96b2317c49aaa418632ec49046ce0c93a5 /src/backend
parent	bb08357723c3188d73f3eca170987d4d7af58635 (diff)
parent	337c0b03614c45516f2c3ec956405713bb264d54 (diff)