Made JSON datatype well-behaved with respect to character sets.

Note that this is currently untested with server encodings other than UTF-8. The encoding policy used is: JSON nodes and most of the JSON functions still operate in UTF-8. Strings are converted between server encoding and UTF-8 when they go in and out of varlena (text*), and a set of helper functions is implemented to make these conversions simple to apply. It is done this way because converting individual codepoints to/from whatever the server encoding may be is nontrivial (possibly requires a loaded module). The JSON code needs to encode/decode codepoints when it deals with escapes. Although a more clever and efficient solution might be to defer charset conversions until they're necessary (e.g. round up all the escapes and encode them all at once), this is not simple, and it's probably not much more efficient, either. Conversions to/from server encoding and UTF-8 are no-ops when the server encoding is UTF-8, anyway.
author: Joey Adams 2010-08-04 21:44:22 +0000
committer: Joey Adams 2010-08-04 21:44:22 +0000
commit: 2b2fda2b7219004d65c1d121e7fed52ba85a9fb8 (patch)
tree: 30300ad44b786a6d5ed6106f1010c481d49b6e10
parent: f475e581b72b8c42cf951f6653610d15e71caeee (diff)
6 files changed, 255 insertions, 56 deletions
diff --git a/json.c b/json.c
index eed3665..c7a62df 100644
--- a/json.c
+++ b/json.c
@@ -351,6 +351,10 @@ char	   *json_decode_string(const char **sp, size_t *length, bool strict);
 /*
  * json_validate
  *    Make sure the given UTF-8 string is valid JSON.
+ *
+ * TODO: Consider making a dedicated function for this so we don't have to
+ *       convert to UTF-8, build a JSON node, then free both
+ *       whenever we need to validate (such as in json_in and json_recv).
  */
 bool
 json_validate(const char *str)
@@ -364,6 +368,27 @@ json_validate(const char *str)
 }
 
 /*
+ * json_validate_server_encoded
+ *    Variant of json_validate that takes a server-encoded string
+ *    rather than a UTF-8 string.
+ *
+ *    Note that a dedicated json_validate (described in the TODO above)
+ *    would be able to handle both encodings natively, since both are
+ *    ASCII-compatible.
+ */
+bool
+json_validate_server_encoded(const char *str)
+{
+	char	*str_utf8	= server_to_utf8(str, strlen(str));
+	bool	result		= json_validate(str_utf8);
+	/* server_to_utf8 hands back str itself when no conversion was done; free only a separate copy. */
+	if (str_utf8 != str)
+		pfree(str_utf8);
+	
+	return result;
+}
+
+/*
  * json_decode
  *    Convert a JSON-encoded string to a JSON node.
  *    @str must be valid UTF-8.
diff --git a/json.h b/json.h
index de4fb26..588f3cb 100644
--- a/json.h
+++ b/json.h
@@ -125,6 +125,7 @@ struct json_node
 
 
 bool		json_validate(const char *str);
+bool		json_validate_server_encoded(const char *str);
 json_node  *json_decode(const char *str);
 
 #define JSONOPT_USE_ORIG		1
@@ -211,7 +212,4 @@ void		json_set_string(json_node * node, const char *str, size_t length);
 const char *json_get_number(json_node * node);
 void		json_set_number(json_node * node, const char *number, size_t length);
 
-/* Utility function used by json_get to automatically apply from_json to its result. */
-const char *from_json_cstring(const char *input, const char *funcname);
-
 #endif
diff --git a/json_io.c b/json_io.c
index 502372e..130b679 100644
--- a/json_io.c
+++ b/json_io.c
@@ -21,16 +21,12 @@ Datum
 json_in(PG_FUNCTION_ARGS)
 {
 	char	   *string		= PG_GETARG_CSTRING(0);
-	jsontype   *vardata		= cstring_to_text(string);
-	int			len			= VARSIZE(vardata) - VARHDRSZ;
-	char	   *utf8string;
+	jsontype   *vardata;
 
-	utf8string = (char *) pg_do_encoding_conversion(
-			(unsigned char *) string, len, GetDatabaseEncoding(), PG_UTF8);
-
-	if (!json_validate(utf8string))
+	if (!json_validate_server_encoded(string))
 		elog(ERROR, "invalid JSON content");
 
+	vardata = cstring_to_text(string);
 	PG_RETURN_JSON_P(vardata);
 }
 
@@ -39,10 +35,12 @@ Datum		json_out(PG_FUNCTION_ARGS);
 Datum
 json_out(PG_FUNCTION_ARGS)
 {
-	jsontype   *vardata = PG_GETARG_JSON_P(0);
-	char	   *s = text_to_cstring((text *) vardata);
+	jsontype   *vardata	= PG_GETARG_JSON_P(0);
+	char	   *string	= text_to_cstring((text *) vardata);
+	
+	Assert(json_validate_server_encoded(string));
 
-	PG_RETURN_CSTRING(s);
+	PG_RETURN_CSTRING(string);
 }
 
 PG_FUNCTION_INFO_V1(json_recv);
@@ -51,13 +49,13 @@ Datum
 json_recv(PG_FUNCTION_ARGS)
 {
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
-	jsontype   *result;
 	char	   *str;
 	int			nbytes;
-
+	jsontype   *result;
+	
 	str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
-
-	if (!json_validate(str))
+	
+	if (!json_validate_server_encoded(str))
 		elog(ERROR, "invalid JSON content");
 
 	result = cstring_to_text_with_len(str, nbytes);
@@ -73,21 +71,37 @@ json_send(PG_FUNCTION_ARGS)
 {
 	jsontype   *t = PG_GETARG_JSON_P(0);
 	StringInfoData buf;
+	
+	#ifdef USE_ASSERT_CHECKING
+	{
+		char *string = text_to_cstring(t);
+		
+		Assert(json_validate_server_encoded(string));
+		
+		pfree(string);
+	}
+	#endif
 
 	pq_begintypsend(&buf);
 	pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
 }
 
-const char *
-from_json_cstring(const char *input, const char *funcname)
+/*
+ * Performs JSON value extraction in UTF-8 C-String land.
+ *
+ * If the input was JSON-encoded NULL, this function returns NULL
+ * (indicating that we want from_json to yield actual SQL NULL).
+ */
+static const char *
+from_json_cstring(const char *input)
 {
 	size_t		len;
 	json_node  *json = json_decode(input);
 	const char *cstring_out = NULL;
 
 	if (json == NULL)
-		elog(ERROR, "%s: JSON content was corrupted", funcname);
+		elog(ERROR, "from_json: JSON content was corrupted");
 
 	switch (json->type)
 	{
@@ -97,9 +111,8 @@ from_json_cstring(const char *input, const char *funcname)
 		case JSON_STRING:
 			cstring_out = json_get_string(json, &len);
 			if (strlen(cstring_out) != len)
-				elog(ERROR, "%s: null terminator occurred in a JSON string; can't convert to TEXT"		/* (use
-																										 * from_json_as_bytea
-						 instead) */ , funcname);
+				elog(ERROR, "from_json: null terminator occurred in a JSON string; can't convert to TEXT");
+					/* todo: suggest using from_json_as_bytea instead when that is available. */ 
 			break;
 		case JSON_NUMBER:
 			cstring_out = json_get_number(json);
@@ -108,10 +121,10 @@ from_json_cstring(const char *input, const char *funcname)
 			cstring_out = json_get_bool(json) ? "true" : "false";
 			break;
 		case JSON_OBJECT:
-			elog(ERROR, "%s does not support conversion from objects yet", funcname);
+			elog(ERROR, "from_json does not support conversion from objects yet");
 			break;
 		case JSON_ARRAY:
-			elog(ERROR, "%s does not support conversion from arrays yet", funcname);
+			elog(ERROR, "from_json does not support conversion from arrays yet");
 			break;
 		default:
 			Assert(false);
@@ -125,15 +138,15 @@ Datum		from_json(PG_FUNCTION_ARGS);
 Datum
 from_json(PG_FUNCTION_ARGS)
 {
-
-	jsontype   *vardata_in = PG_GETARG_JSON_P(0);
-	char	   *cstring_in = text_to_cstring((text *) vardata_in);
-	const char *cstring_out = from_json_cstring(cstring_in, "from_json");
+	char	   *cstring_in	= text_to_utf8_cstring(PG_GETARG_JSON_P(0));
+	const char *cstring_out	= from_json_cstring(cstring_in);
 	text	   *vardata_out;
+	
+	pfree(cstring_in);
 
 	if (cstring_out)
 	{
-		vardata_out = cstring_to_text(cstring_out);
+		vardata_out = utf8_cstring_to_text(cstring_out);
 		PG_RETURN_TEXT_P(vardata_out);
 	}
 	else
@@ -165,7 +178,7 @@ to_json(PG_FUNCTION_ARGS)
 	if (PG_ARGISNULL(0))
 		PG_RETURN_JSON_P(cstring_to_text("null"));
 
-	PG_RETURN_JSON_P(cstring_to_text(
+	PG_RETURN_JSON_P(utf8_cstring_to_text(
 				  datum_to_json(PG_GETARG_DATUM(0), typeInfo, target_type)));
 }
 
@@ -200,26 +213,41 @@ invalid:
 	return JSON_INVALID;
 }
 
+/*
+ * datum_to_json
+ *    Converts a datum to a UTF-8-encoded JSON string.
+ *
+ *    typeInfo comes from the getTypeInfo function, and 
+ *    target_type comes from decide_json_type .
+ *
+ *    See to_json and array_to_json for examples of
+ *    how to invoke this function.
+ */
 static const char *
 datum_to_json(Datum datum, TypeInfo *typeInfo, json_type target_type)
 {
 	char	   *cstring;
+	char	   *cstring_utf8;
 	json_node  *node;
-	char	   *encoded;
+	char	   *encoded_utf8;
 
 	switch (target_type)
 	{
 		case JSON_STRING:
 		case JSON_NUMBER:
 			cstring = OutputFunctionCall(&typeInfo->proc, datum);
+			cstring_utf8 = server_to_utf8(cstring, strlen(cstring));
+			
+			if (cstring != cstring_utf8)
+				pfree(cstring);
 
 			if (target_type == JSON_STRING)
-				node = json_mkstring(cstring, strlen(cstring));
+				node = json_mkstring(cstring_utf8, strlen(cstring_utf8));
 			else
-				node = json_mknumber(cstring, strlen(cstring));
+				node = json_mknumber(cstring_utf8, strlen(cstring_utf8));
 
-			encoded = json_encode(node, 0);
-			if (!encoded)
+			encoded_utf8 = json_encode(node, 0);
+			if (!encoded_utf8)
 			{
 				/*
 				 * This usually means the given string/number type was
@@ -231,8 +259,8 @@ datum_to_json(Datum datum, TypeInfo *typeInfo, json_type target_type)
 					 target_type == JSON_STRING ? "string" : "number"
 					);
 			}
-
-			return encoded;
+			
+			return encoded_utf8;
 
 		case JSON_BOOL:
 			return DatumGetBool(datum) ? "true" : "false";
@@ -248,7 +276,7 @@ datum_to_json(Datum datum, TypeInfo *typeInfo, json_type target_type)
 
 /*
  * array_to_json
- *	  Converts a PostgreSQL array datum to JSON.
+ *	  Converts a PostgreSQL array datum to a UTF-8-encoded JSON string.
  *
  *	  Note: We assume that any type with typcategory = 'A'
  *			is compatible with array_out.
@@ -271,6 +299,11 @@ array_to_json(Datum datum)
 	const char **values;
 	const char *ret;
 
+	/*
+	 * todo: Consider caching the TypeInfo of array items
+	 *       (which we're computing now) in the fcinfo->flinfo->fn_mcxt
+	 *       of to_json like we do with the TypeInfo of the array itself.
+	 */
 	initTypeInfo(&element_typeinfo, CurrentMemoryContext);
 	getTypeInfo(&element_typeinfo, element_type, IOFunc_output);
 
diff --git a/json_op.c b/json_op.c
index a070c8a..684d8ec 100644
--- a/json_op.c
+++ b/json_op.c
@@ -18,8 +18,12 @@ Datum		json_validate_f(PG_FUNCTION_ARGS);
 Datum
 json_validate_f(PG_FUNCTION_ARGS)
 {
-	const char *s = text_to_cstring(PG_GETARG_JSON_P(0));
-	bool		ret = json_validate(s);
+	char	*string;
+	bool	ret;
+	
+	string = text_to_utf8_cstring(PG_GETARG_JSON_P(0));
+	ret = json_validate(string);
+	pfree(string);
 
 	PG_RETURN_BOOL(ret);
 }
@@ -34,6 +38,10 @@ json_get_type(PG_FUNCTION_ARGS)
 	jsontype   *t = PG_GETARG_JSON_P(0);
 	json_type	type;
 
+	/*
+	 * No need to convert to UTF-8 before calling json_text_type,
+	 * as it looks solely at ASCII characters.
+	 */
 	type = json_text_type(VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
 
 	if (!json_type_is_valid(type))
@@ -51,21 +59,25 @@ Datum		json_condense(PG_FUNCTION_ARGS);
 Datum
 json_condense(PG_FUNCTION_ARGS)
 {
-	char	   *string = text_to_cstring(PG_GETARG_JSON_P(0));
-	json_node  *json = json_decode(string);
-	char	   *condensed = json_encode(json, 0);
+	char	   *string;
+	json_node  *json;
+	char	   *condensed;
+
+	string = text_to_utf8_cstring(PG_GETARG_JSON_P(0));
+	json = json_decode(string);
+	condensed = json_encode(json, 0);
 
 	if (condensed == NULL)
 		elog(ERROR, "json_condense: Corrupt JSON content");
 
-	PG_RETURN_JSON_P(cstring_to_text(condensed));
+	PG_RETURN_JSON_P(utf8_cstring_to_text(condensed));
 }
 
 static List *
 json_path_base(FunctionCallInfo fcinfo, const char *funcname)
 {
-	char	   *json_string = text_to_cstring(PG_GETARG_JSON_P(0));
-	char	   *path_string = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	char	   *json_string = text_to_utf8_cstring(PG_GETARG_JSON_P(0));
+	char	   *path_string = text_to_utf8_cstring(PG_GETARG_TEXT_PP(1));
 	JSONPath   *jpath = jp_parse(path_string);
 	json_node  *json = json_decode(json_string);
 	List	   *result_list;
@@ -87,7 +99,7 @@ json_get(PG_FUNCTION_ARGS)
 {
 	List	   *result_list = json_path_base(fcinfo, "json_get");
 	ListCell   *result;
-	const char *rs_json;
+	char	   *rs_json;
 	jsontype   *result_vardata;
 	int			length = list_length(result_list);
 
@@ -103,7 +115,9 @@ json_get(PG_FUNCTION_ARGS)
 		Assert(rs_json != NULL);
 		Assert(json_validate(rs_json) == true);
 
-		result_vardata = (jsontype *) cstring_to_text(rs_json);
+		result_vardata = (jsontype *) utf8_cstring_to_text(rs_json);
+		
+		pfree(rs_json);
 
 		PG_RETURN_JSON_P(result_vardata);
 	}
@@ -119,12 +133,14 @@ Datum		json_set(PG_FUNCTION_ARGS);
 Datum
 json_set(PG_FUNCTION_ARGS)
 {
-	char	   *json_string = text_to_cstring(PG_GETARG_JSON_P(0));
-	char	   *path_string = text_to_cstring(PG_GETARG_TEXT_PP(1));
-	char	   *rvalue_string = text_to_cstring(PG_GETARG_JSON_P(2));
+	char	   *json_string = text_to_utf8_cstring(PG_GETARG_JSON_P(0));
+	char	   *path_string = text_to_utf8_cstring(PG_GETARG_TEXT_PP(1));
+	char	   *rvalue_string = text_to_utf8_cstring(PG_GETARG_JSON_P(2));
 	JSONPath   *jpath = jp_parse(path_string);
 	json_node  *json = json_decode(json_string);
 	json_node  *rvalue = json_decode(rvalue_string);
+	char	   *result;
+	jsontype   *result_text;
 
 	if (!jpath)
 		elog(ERROR, "json_set: Invalid JSONPath expression");
@@ -134,8 +150,13 @@ json_set(PG_FUNCTION_ARGS)
 		elog(ERROR, "json_set: Corrupt JSON content (3rd parameter)");
 
 	jp_set(jpath, json, rvalue);
+	
+	result = json_encode(json, JSONOPT_USE_ORIG | JSONOPT_NO_TRIM);
+	result_text = utf8_cstring_to_text(result);
+	
+	pfree(result);
 
-	PG_RETURN_JSON_P(cstring_to_text(json_encode(json, JSONOPT_USE_ORIG | JSONOPT_NO_TRIM)));
+	PG_RETURN_JSON_P(result_text);
 }
 
 PG_FUNCTION_INFO_V1(json_path);
@@ -174,7 +195,7 @@ json_path(PG_FUNCTION_ARGS)
 		Assert(json_string != NULL);
 		Assert(json_validate(json_string) == true);
 
-		json_text = cstring_to_text(json_string);
+		json_text = utf8_cstring_to_text(json_string);
 
 		funcctx->user_fctx = lnext(result);
 
@@ -189,7 +210,7 @@ Datum		parse_json_path(PG_FUNCTION_ARGS);
 Datum
 parse_json_path(PG_FUNCTION_ARGS)
 {
-	char	   *string = text_to_cstring(PG_GETARG_TEXT_PP(0));
+	char	   *string = text_to_utf8_cstring(PG_GETARG_TEXT_PP(0));
 	JSONPath   *jpath = jp_parse(string);
 	char	   *normalized;
 
@@ -197,5 +218,5 @@ parse_json_path(PG_FUNCTION_ARGS)
 		PG_RETURN_NULL();
 
 	normalized = jp_show(jpath);
-	PG_RETURN_TEXT_P(cstring_to_text(normalized));
+	PG_RETURN_TEXT_P(utf8_cstring_to_text(normalized));
 }
diff --git a/util.c b/util.c
index 9c64542..3171a4f 100644
--- a/util.c
+++ b/util.c
@@ -52,6 +52,31 @@ enumLabelToOid(const char *typname, const char *label)
 	return ret;
 }
 
+/*
+ * utf8_substring
+ *    Find substring bounds in a UTF-8-encoded string.
+ *
+ *    @src and @srcbytes are the start and byte length of the input string.
+ *    @start and @length are the start and number of characters requested.
+ *
+ *    Writes the bounds of the substring to
+ *    *out_start (start) and *out_bytes (byte length).
+ *    Returns the number of characters (not bytes) in the substring.
+ *
+ *    Example:
+ *       const char *out_start;
+ *       int         out_bytes;
+ *       int         out_chars;
+ *
+ *       out_chars =
+ *           utf8_substring("⁰¹²³", 9,
+ *                             1, 100,
+ *                             &out_start, &out_bytes);
+ *
+ *    out_chars will be 3.
+ *    out_start will point to the "¹".
+ *    out_bytes will be 6.
+ */
 size_t
 utf8_substring(
 			   const char *src, size_t srcbytes,
@@ -124,6 +149,11 @@ utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
 	*sp = (const char *) s;
 }
 
+/*
+ * utf8_validate
+ *    Essentially a variant of pg_verify_mbstr(PG_UTF8, str, length, true)
+ *    that allows '\0' characters.
+ */
 bool
 utf8_validate(const char *str, size_t length)
 {
@@ -158,3 +188,69 @@ utf8_encode_char(char *out, unsigned int uc)
 	unicode_to_utf8(uc, (unsigned char *) out);
 	return pg_utf_mblen((unsigned char *) out);
 }
+
+char *
+server_to_utf8(const char *str, int len)
+{
+	return (char *) pg_do_encoding_conversion(
+				(unsigned char *) str, len, GetDatabaseEncoding(), PG_UTF8);
+}
+
+char *
+utf8_to_server(const char *str, int len)
+{
+	return (char *) pg_do_encoding_conversion(
+				(unsigned char *) str, len, PG_UTF8, GetDatabaseEncoding());
+}
+
+/*
+ * Adaptations of text_to_cstring and cstring_to_text for simplifying UTF-8 conversions.
+ *
+ * Just like text_to_cstring, text_to_utf8_cstring will always return a palloc'd,
+ * null-terminated C-string.
+ */
+char *text_to_utf8_cstring(const text *t)
+{
+	/* must cast away the const, just like in text_to_cstring */
+	text		*tunpacked	= pg_detoast_datum_packed((struct varlena *) t);
+	const char	*data		= VARDATA_ANY(tunpacked);
+	int			len			= VARSIZE_ANY_EXHDR(tunpacked);
+	char		*result;
+	/* If no conversion was done we get data itself back; copy it so the result is a separate, length-bounded C-string. */
+	result = server_to_utf8(data, len);
+	if (result == data)
+		result = pnstrdup(data, len);
+	/* Release the detoasted copy (presumably only made when t was toasted/compressed). */
+	if (tunpacked != t)
+		pfree(tunpacked);
+	
+	return result;
+}
+
+text *utf8_cstring_to_text(const char *s)
+{
+	return utf8_cstring_to_text_with_len(s, strlen(s));
+}
+
+text *utf8_cstring_to_text_with_len(const char *s, int len)
+{
+	char	*cstring;
+	int		cstring_len;
+	text	*result;
+	/* Build a text value from the UTF-8 buffer s (len bytes); convert first, as the byte length may change. */
+	cstring	= utf8_to_server(s, len);
+	if (cstring == s)
+		cstring_len = len;
+	else
+		cstring_len = strlen(cstring);
+	/* Size the varlena by the converted length, not the UTF-8 input length, to avoid overflow/garbage bytes. */
+	result	= (text *) palloc(cstring_len + VARHDRSZ);
+	
+	SET_VARSIZE(result, cstring_len + VARHDRSZ);
+	memcpy(VARDATA(result), cstring, cstring_len);
+	/* utf8_to_server returns s itself when no conversion was done; free only a separate copy. */
+	if (cstring != s)
+		pfree(cstring);
+	
+	return result;
+}
diff --git a/util.h b/util.h
index 456708a..fa49727 100644
--- a/util.h
+++ b/util.h
@@ -6,6 +6,7 @@
 #include "funcapi.h"
 #include "utils/lsyscache.h"
 
+/* TODO:  Make this less "magic" (e.g. make it a function instead of a macro). */
 #define FN_EXTRA(var, ...) FN_EXTRA_SZ(var, sizeof(*(var)), __VA_ARGS__)
 #define FN_EXTRA_SZ(var, alloc, ...) do \
 	{ \
@@ -50,4 +51,29 @@ void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
 bool utf8_validate(const char *str, size_t length);
 int utf8_encode_char(char *out, unsigned int uc);
 
+/*
+ * Adaptations of pg_do_encoding_conversion for simplifying UTF-8 conversions.
+ *
+ * These are used frequently in the JSON code because JSON nodes are encoded
+ * in UTF-8.  The reason they are encoded in UTF-8 is because we need to
+ * be able to handle Unicode escapes, and there's
+ * no simple and efficient way to do that with the server encoding.
+ *
+ * Just like pg_do_encoding_conversion, if no conversion is done, the original
+ * pointer given is returned.
+ */
+char *server_to_utf8(const char *str, int len);
+char *utf8_to_server(const char *str, int len);
+
+/*
+ * Adaptations of text_to_cstring and cstring_to_text for simplifying UTF-8 conversions.
+ *
+ * Just like text_to_cstring, text_to_utf8_cstring will always return a palloc'd,
+ * null-terminated C-string.
+ */
+char *text_to_utf8_cstring(const text *t);
+text *utf8_cstring_to_text(const char *s);
+text *utf8_cstring_to_text_with_len(const char *s, int len);
+
+
 #endif
author	Joey Adams	2010-08-04 21:44:22 +0000
committer	Joey Adams	2010-08-04 21:44:22 +0000
commit	2b2fda2b7219004d65c1d121e7fed52ba85a9fb8 (patch)
tree	30300ad44b786a6d5ed6106f1010c481d49b6e10
parent	f475e581b72b8c42cf951f6653610d15e71caeee (diff)