* Migrated my Unicode functions to util.c and made them rely more on PostgreSQL's pg_wchar.h routines.
* Touched up various functions' documentation.

json_nodes are currently encoded in UTF-8, and the JSON module is not yet 100% compatible with arbitrary server encodings. I plan to switch from UTF-8 to the server encoding pretty soon, after which JSON should be a well-behaved datatype as far as character sets go.
author: Joey Adams 2010-07-24 22:28:46 +0000
committer: Joey Adams 2010-07-24 22:28:46 +0000
commit: f475e581b72b8c42cf951f6653610d15e71caeee (patch)
tree: 0a30a239ae10b3ed799688ecbd6791d4488a2cff
parent: b32257221b4b8e15fada7aabd0fe6e129e00d3e8 (diff)
6 files changed, 135 insertions, 224 deletions
diff --git a/json.c b/json.c
index ec4e0ca..eed3665 100644
--- a/json.c
+++ b/json.c
@@ -22,10 +22,10 @@
 */
 
 #include "json.h"
+#include "util.h"
 
 #include <ctype.h>
 
-
 #define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)
 
 /* We can't use isspace() because it also accepts \v and \f, which
@@ -102,10 +102,6 @@ write_hex16(char *out, unsigned int val)
 	*out++ = hex[val & 0xF];
 }
 
-static bool utf8_validate(const char *str, size_t length);
-static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
-static int	utf8_encode_char(char *out, unsigned int uc);
-
 
 /*********** json_node creation, manipulation, and deletion **********/
 
@@ -352,6 +348,10 @@ char	   *json_decode_string(const char **sp, size_t *length, bool strict);
    because it's also used to parse object member keys.
    It's also useful outside of json.c, such as in jsonpath.c . */
 
+/*
+ * json_validate
+ *    Make sure the given UTF-8 string is valid JSON.
+ */
 bool
 json_validate(const char *str)
 {
@@ -363,6 +363,11 @@ json_validate(const char *str)
 	return true;
 }
 
+/*
+ * json_decode
+ *    Convert a JSON-encoded string to a JSON node.
+ *    @str must be valid UTF-8.
+ */
 json_node *
 json_decode(const char *str)
 {
@@ -378,8 +383,7 @@ json_decode(const char *str)
 	if (!str)
 		return NULL;
 
-	if (!utf8_validate(str, strlen(str)))
-		return NULL;
+	Assert(utf8_validate(str, strlen(str)));
 
 	expect_endp = false;
 	goto item;
@@ -601,8 +605,7 @@ decode_leaf(const char **sp)
  * However, some JSON parsers are more liberal.  For instance, PHP accepts
  * '.5' and '1.'.  JSON.parse accepts '+3'.
  *
- * This function takes the strict approach.  The user should use
- * json_clean() to handle liberal JSON text.
+ * This function takes the strict approach.
  */
 static bool
 validate_number(const char **sp)
@@ -669,6 +672,27 @@ decode_number(const char **sp)
 	return json_mknumber(start, end - start);
 }
 
+/*
+ * json_decode_string
+ *    If you're interested in decoding JSON in general, see json_decode.
+ *
+ *    Decodes a JSON string literal (e.g. "\"hello\"").
+ *
+ *    If strict is true, string must be double-quoted,
+ *    as is required by the JSON RFC.
+ *    Otherwise (e.g. if parsing something JSON-like, such as JSONPath),
+ *    the string may be single- or double-quoted.
+ *
+ *    Also, no whitespace skipping is done, so the caller should only
+ *    call this function when it expects **sp to be either " or '
+ *
+ *    On success, returns the decoded string, passes that string's length
+ *    through *length (which must not be NULL), and advances *sp to point
+ *    to the end of string literal (including the quote character).
+ *
+ *    On failure (parse error), returns NULL and
+ *    leaves *length and *sp untouched.
+ */
 char *
 json_decode_string(const char **sp, size_t *length, bool strict)
 {
@@ -754,6 +778,7 @@ json_decode_string(const char **sp, size_t *length, bool strict)
 						goto failed;
 
 					len = utf8_encode_char(buf, uc);
+					Assert(len > 0);
 					appendBinaryStringInfo(&ret, buf, len);
 
 					continue;	/* Continue the enclosing while loop to skip
@@ -825,14 +850,15 @@ json_text_type(const char *str, size_t nbytes)
 
 /****************************** Encoding *****************************/
 
-static bool
-encode_string(StringInfo out, const char *string, size_t length, char quote, bool escape_unicode)
+static void
+encode_string(StringInfo out, const char *string, size_t length, char quote,
+			bool escape_unicode)
 {
 	const char *s = string;
 	const char *e = s + length;
 
-	if (!utf8_validate(string, length) || quote == '\\')
-		return false;
+	Assert(utf8_validate(string, length));
+	Assert(quote != '\\');
 
 	appendStringInfoChar(out, quote);
 
@@ -912,8 +938,6 @@ encode_string(StringInfo out, const char *string, size_t length, char quote, boo
 	}
 
 	appendStringInfoChar(out, quote);
-
-	return true;
 }
 
 static bool
@@ -1010,12 +1034,9 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 					txt = "false";
 				break;
 			case JSON_STRING:
-				if (!encode_string(&ctx->str,
-								   node->v.string.str,
-								   node->v.string.length,
-								   '"',
-								   ctx->escape_unicode))
-					return false;
+				encode_string(&ctx->str,
+						node->v.string.str, node->v.string.length,
+						'"', ctx->escape_unicode);
 				break;
 			case JSON_NUMBER:
 				if (!encode_number(&ctx->str, node->v.number))
@@ -1049,18 +1070,10 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 						push_orig(key_left_space);
 
 					if (has_orig(key))
-					{
 						push_orig(key);
-					}
 					else
-					{
-						if (!encode_string(&ctx->str,
-										   node->key,
-										   node->key_length,
-										   '"',
-										   ctx->escape_unicode))
-							return false;
-					}
+						encode_string(&ctx->str, node->key, node->key_length,
+										'"', ctx->escape_unicode);
 
 					if (has_orig(key_right_space))
 						push_orig(key_right_space);
@@ -1092,171 +1105,26 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 #undef push_orig
 }
 
-char *
-json_encode_string(const char *str, size_t length, char quote, bool escape_unicode)
-{
-	StringInfoData ret;
-
-	initStringInfo(&ret);
-	if (!encode_string(&ret, str, length, quote, escape_unicode))
-	{
-		pfree(ret.data);
-		return NULL;
-	}
-
-	return ret.data;
-}
-
-
-/****************************** Unicode ******************************/
-
-static const bool utf8_allow_surrogates = false;
-
-static void
-utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
-{
-	const unsigned char *s = (const unsigned char *) *sp;
-	unsigned char c = *s++;
-	unsigned int len;
-	unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7};
-
-	if (c < 0x80)
-		len = 0;
-	else if (c < 0xE0)
-		len = 1;
-	else if (c < 0xF0)
-		len = 2;
-	else
-		len = 3;
-
-	*uc = c & sf[len];
-	while (len--)
-	{
-		*uc <<= 6;
-		*uc |= *s++ & 0x3F;
-	}
-
-	*sp = (const char *) s;
-}
-
-static bool
-utf8_validate(const char *str, size_t length)
-{
-	const unsigned char *s = (const unsigned char *) str;
-	const unsigned char *e = s + length;
-
-	while (s < e)
-	{
-		unsigned char c = *s++;
-		unsigned int len;		/* number of bytes in sequence - 2 */
-
-		/* If character is ASCII, move on. */
-		if (c < 0x80)
-			continue;
-
-		if (s >= e)
-			return false;		/* Missing bytes in sequence. */
-
-		if (c < 0xE0)
-		{
-			/*
-			 * 2-byte sequence, U+0080 to U+07FF c must be 11000010 or higher
-			 * s[0] must be 10xxxxxx
-			 */
-			len = 0;
-			if (c < 0xC2)
-				return false;
-		}
-		else if (c < 0xF0)
-		{
-			/*
-			 * 3-byte sequence, U+0800 to U+FFFF Note that the surrogate range
-			 * is U+D800 to U+DFFF, and that U+FFFE and U+FFFF are illegal
-			 * characters. c must be >= 11100000 (which it is) If c is
-			 * 11100000, then s[0] must be >= 10100000 If the global parameter
-			 * utf8_allow_surrogates is false: If c is 11101101 and s[0] is >=
-			 * 10100000, then this is a surrogate and we should fail. If c is
-			 * 11101111, s[0] is 10111111, and s[1] >= 10111110, then this is
-			 * an illegal character and we should fail. s[0] and s[1] must be
-			 * 10xxxxxx
-			 */
-			len = 1;
-			if (c == 0xE0 && *s < 0xA0)
-				return false;
-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-				return false;
-			if (c == 0xEF && s[0] == 0xBF && (s + 1 >= e || s[1] >= 0xBE))
-				return false;
-		}
-		else
-		{
-			/*
-			 * 4-byte sequence, U+010000 to U+10FFFF c must be >= 11110000
-			 * (which it is) and <= 11110100 If c is 11110000, then s[0] must
-			 * be >= 10010000 If c is 11110100, then s[0] must be < 10010000
-			 * s[0], s[1], and s[2] must be 10xxxxxx
-			 */
-			len = 2;
-			if (c > 0xF4)
-				return false;
-			if (c == 0xF0 && *s < 0x90)
-				return false;
-			if (c == 0xF4 && *s >= 0x90)
-				return false;
-		}
-
-		if (s + len >= e)
-			return false;		/* Missing bytes in sequence. */
-
-		do
-		{
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len--);
-	}
-
-	return true;
-}
-
 /*
- * Encodes the Unicode character uc as UTF-8, writing it
- * to *out and updating *out to point to the end of the UTF-8 sequence.
+ * json_encode_string
+ *    If you're interested in encoding JSON in general, see json_encode.
  *
- * If uc is too high, no character will be emitted, and *out will
- * not be changed.	If uc is in the UTF-16 surrogate range
- * (U+D800 thru U+DFFF) or is a designated not-a-character
- * (U+FFFE or U+FFFF), the character will be emitted anyway,
- * although it is technically invalid UTF-8.
+ *    Encodes a string literal JSON-style using the given quote character.
+ *    Note that using anything but '"' as the quote character will result
+ *    in invalid JSON.
  *
- * Returns the number of characters emitted.
+ *    @str must be valid UTF-8, though it may contain null characters
+ *       (hence the length argument).
+ *    @quote must not be a backslash.
  */
-static int
-utf8_encode_char(char *out, unsigned int uc)
+char *
+json_encode_string(const char *str, size_t length, char quote,
+					bool escape_unicode)
 {
-	char	   *start = out;
+	StringInfoData ret;
 
-	if (uc < 0x80)
-	{
-		*out++ = uc & 0x7F;
-	}
-	else if (uc < 0x800)
-	{
-		*out++ = 0xC0 | (uc >> 6);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
-	else if (uc < 0x10000)
-	{
-		*out++ = 0xE0 | (uc >> 12);
-		*out++ = 0x80 | ((uc >> 6) & 0x3F);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
-	else if (uc < 0x110000)
-	{
-		*out++ = 0xF0 | ((uc >> 18) & 0x07);
-		*out++ = 0x80 | ((uc >> 12) & 0x3F);
-		*out++ = 0x80 | ((uc >> 6) & 0x3F);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
+	initStringInfo(&ret);
+	encode_string(&ret, str, length, quote, escape_unicode);
 
-	return out - start;
+	return ret.data;
 }
diff --git a/json.h b/json.h
index 3391cea..de4fb26 100644
--- a/json.h
+++ b/json.h
@@ -162,32 +162,7 @@ json_head(json_node * parent)
 	}
 }
 
-
-/*
- * Decodes a JSON-encoded string literal
- *	  (If you're interested in the decoding JSON in general, see json_decode).
- * If strict is true, string must be double-quoted,
- * as is required by the JSON RFC.
- * Otherwise, the string may be single- or double-quoted.
- * Also, no whitespace skipping is done, so the caller should only
- * call this function when it expects **sp to be either " or '
- *
- * On success, returns the decoded string and passes that string's length
- * through *length (which must not be NULL).  On failure (parse error),
- * returns NULL and leaves *length untouched.
- */
 char	   *json_decode_string(const char **sp, size_t *length, bool strict);
-
-/*
- * Encodes a string literal JSON-style using the given quote character,
- * only escaping characters when necessary
- *	  (If you're interested in encoding JSON in general, see json_encode).
- * Note that using anything but '"' as the quote character will result in
- * invalid JSON.
- *
- * Returns NULL if input is invalid UTF-8 or if an invalid quote character
- * (such as backslash) is given.
- */
 char	   *json_encode_string(const char *str, size_t length, char quote, bool escape_unicode);
 
 /* Add child to parent, putting it at the end. */
@@ -199,10 +174,6 @@ void		json_remove(json_node * node);
 /* Update the value of a node, preserving position and key information. */
 void		json_replace_value(json_node * node, json_node * replacement);
 
-/* Note that the factory functions and get/set functions do not validate input.
- * However, json_encode validates node contents to avoid producing
- * invalid JSON. */
-
 /* Node factory functions */
 json_node  *json_mknode(json_type type);
 json_node  *json_mkbool(bool v_bool);
diff --git a/json_io.c b/json_io.c
index da9d6b7..502372e 100644
--- a/json_io.c
+++ b/json_io.c
@@ -2,6 +2,7 @@
 #include "util.h"
 
 #include "utils/array.h"
+#include "mb/pg_wchar.h"
 
 PG_MODULE_MAGIC;
 
@@ -19,10 +20,15 @@ Datum		json_in(PG_FUNCTION_ARGS);
 Datum
 json_in(PG_FUNCTION_ARGS)
 {
-	const char *s = PG_GETARG_CSTRING(0);
-	jsontype   *vardata = cstring_to_text(s);
+	char	   *string		= PG_GETARG_CSTRING(0);
+	jsontype   *vardata		= cstring_to_text(string);
+	int			len			= VARSIZE(vardata) - VARHDRSZ;
+	char	   *utf8string;
 
-	if (!json_validate(s))
+	utf8string = (char *) pg_do_encoding_conversion(
+			(unsigned char *) string, len, GetDatabaseEncoding(), PG_UTF8);
+
+	if (!json_validate(utf8string))
 		elog(ERROR, "invalid JSON content");
 
 	PG_RETURN_JSON_P(vardata);
diff --git a/jsonpath.c b/jsonpath.c
index eaf5b7a..591c5fd 100644
--- a/jsonpath.c
+++ b/jsonpath.c
@@ -176,7 +176,6 @@ jp_show(JSONPath * jp)
 				break;
 			case JP_KEY_SUBSCRIPT:
 				tmp = json_encode_string(elem->data.key.ptr, elem->data.key.length, '"', false);
-				Assert(tmp != NULL);
 				appendStringInfo(string, "%s[%s]", rd ? ".." : "", tmp);
 				pfree(tmp);
 				break;
diff --git a/util.c b/util.c
index 39fb0b8..9c64542 100644
--- a/util.c
+++ b/util.c
@@ -94,3 +94,67 @@ utf8_substring(
 	*out_bytes = sub_end - sub_start;
 	return sub_length;
 }
+
+static const bool utf8_allow_surrogates = false;
+/* Decode the UTF-8 sequence at *sp into *uc and advance *sp past it; the input bytes are not validated. */
+void
+utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
+{
+	const unsigned char *s = (const unsigned char *) *sp;
+	unsigned char c = *s++;	/* lead byte */
+	unsigned int len;	/* number of bytes consumed after the lead byte */
+	unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7};	/* lead-byte payload masks, indexed by len */
+
+	if (c < 0x80)
+		len = 0;	/* ASCII: the lead byte is the whole character */
+	else if (c < 0xE0)
+		len = 1;
+	else if (c < 0xF0)
+		len = 2;
+	else
+		len = 3;
+
+	*uc = c & sf[len];	/* keep only the payload bits of the lead byte */
+	while (len--)
+	{
+		*uc <<= 6;
+		*uc |= *s++ & 0x3F;	/* append the low six bits of each following byte */
+	}
+
+	*sp = (const char *) s;
+}
+/* Return true if every character in the length bytes at str is accepted by pg_utf8_islegal(); false otherwise. */
+bool
+utf8_validate(const char *str, size_t length)
+{
+	const unsigned char *s = (const unsigned char *) str;
+	const unsigned char *e = s + length;	/* one past the last byte to check */
+	int len;
+	/* Examine one character per iteration. */
+	while (s < e)
+	{
+		if (*s <= 0x7F)	/* single-byte (ASCII) character: nothing more to check */
+		{
+			s++;
+			continue;
+		}
+		/* NOTE(review): pg_utf_mblen() presumably derives the sequence length from the lead byte -- confirm in pg_wchar.h. */
+		len = pg_utf_mblen(s);
+		if (s + len > e)
+			return false;	/* sequence would run past the end of the buffer */
+		
+		if (!pg_utf8_islegal(s, len))
+			return false;
+		
+		s += len;
+	}
+	
+	return true;
+}
+/* Pass uc to unicode_to_utf8() to fill out, then return pg_utf_mblen() of the result; out's capacity is not checked here. */
+int
+utf8_encode_char(char *out, unsigned int uc)
+{
+	unicode_to_utf8(uc, (unsigned char *) out);	/* NOTE(review): presumably emits the UTF-8 bytes for uc -- confirm in pg_wchar.h */
+	return pg_utf_mblen((unsigned char *) out);	/* length is read back from the bytes just written */
+}
diff --git a/util.h b/util.h
index 40692cb..456708a 100644
--- a/util.h
+++ b/util.h
@@ -46,5 +46,8 @@ Oid			enumLabelToOid(const char *typname, const char *label);
 size_t utf8_substring(const char *src, size_t srcbytes,
 			   size_t start, size_t length,
 			   const char **out_start, size_t *out_bytes);
+void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
+bool utf8_validate(const char *str, size_t length);
+int utf8_encode_char(char *out, unsigned int uc);
 
 #endif
author	Joey Adams	2010-07-24 22:28:46 +0000
committer	Joey Adams	2010-07-24 22:28:46 +0000
commit	f475e581b72b8c42cf951f6653610d15e71caeee (patch)
tree	0a30a239ae10b3ed799688ecbd6791d4488a2cff
parent	b32257221b4b8e15fada7aabd0fe6e129e00d3e8 (diff)