1 files changed, 59 insertions, 191 deletions
diff --git a/json.c b/json.c
index ec4e0ca..eed3665 100644
--- a/json.c
+++ b/json.c
@@ -22,10 +22,10 @@
 */
 
 #include "json.h"
+#include "util.h"
 
 #include <ctype.h>
 
-
 #define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)
 
 /* We can't use isspace() because it also accepts \v and \f, which
@@ -102,10 +102,6 @@ write_hex16(char *out, unsigned int val)
 	*out++ = hex[val & 0xF];
 }
 
-static bool utf8_validate(const char *str, size_t length);
-static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
-static int	utf8_encode_char(char *out, unsigned int uc);
-
 
 /*********** json_node creation, manipulation, and deletion **********/
 
@@ -352,6 +348,10 @@ char	   *json_decode_string(const char **sp, size_t *length, bool strict);
    because it's also used to parse object member keys.
    It's also useful outside of json.c, such as in jsonpath.c . */
 
+/*
+ * json_validate
+ *    Make sure the given UTF-8 string is valid JSON.
+ */
 bool
 json_validate(const char *str)
 {
@@ -363,6 +363,11 @@ json_validate(const char *str)
 	return true;
 }
 
+/*
+ * json_decode
+ *    Convert a JSON-encoded string to a JSON node.
+ *    @str must be valid UTF-8.
+ */
 json_node *
 json_decode(const char *str)
 {
@@ -378,8 +383,7 @@ json_decode(const char *str)
 	if (!str)
 		return NULL;
 
-	if (!utf8_validate(str, strlen(str)))
-		return NULL;
+	Assert(utf8_validate(str, strlen(str)));
 
 	expect_endp = false;
 	goto item;
@@ -601,8 +605,7 @@ decode_leaf(const char **sp)
  * However, some JSON parsers are more liberal.  For instance, PHP accepts
  * '.5' and '1.'.  JSON.parse accepts '+3'.
  *
- * This function takes the strict approach.  The user should use
- * json_clean() to handle liberal JSON text.
+ * This function takes the strict approach.
  */
 static bool
 validate_number(const char **sp)
@@ -669,6 +672,27 @@ decode_number(const char **sp)
 	return json_mknumber(start, end - start);
 }
 
+/*
+ * json_decode_string
+ *    If you're interested in decoding JSON in general, see json_decode.
+ *
+ *    Decodes a JSON string literal (e.g. "\"hello\"").
+ *
+ *    If strict is true, string must be double-quoted,
+ *    as is required by the JSON RFC.
+ *    Otherwise (e.g. if parsing something JSON-like, such as JSONPath),
+ *    the string may be single- or double-quoted.
+ *
+ *    Also, no whitespace skipping is done, so the caller should only
+ *    call this function when it expects **sp to be either " or '
+ *
+ *    On success, returns the decoded string, passes that string's length
+ *    through *length (which must not be NULL), and advances *sp to point
+ *    to the end of the string literal (including the quote character).
+ *
+ *    On failure (parse error), returns NULL and
+ *    leaves *length and *sp untouched.
+ */
 char *
 json_decode_string(const char **sp, size_t *length, bool strict)
 {
@@ -754,6 +778,7 @@ json_decode_string(const char **sp, size_t *length, bool strict)
 						goto failed;
 
 					len = utf8_encode_char(buf, uc);
+					Assert(len > 0);
 					appendBinaryStringInfo(&ret, buf, len);
 
 					continue;	/* Continue the enclosing while loop to skip
@@ -825,14 +850,15 @@ json_text_type(const char *str, size_t nbytes)
 
 /****************************** Encoding *****************************/
 
-static bool
-encode_string(StringInfo out, const char *string, size_t length, char quote, bool escape_unicode)
+static void
+encode_string(StringInfo out, const char *string, size_t length, char quote,
+			bool escape_unicode)
 {
 	const char *s = string;
 	const char *e = s + length;
 
-	if (!utf8_validate(string, length) || quote == '\\')
-		return false;
+	Assert(utf8_validate(string, length));
+	Assert(quote != '\\');
 
 	appendStringInfoChar(out, quote);
 
@@ -912,8 +938,6 @@ encode_string(StringInfo out, const char *string, size_t length, char quote, boo
 	}
 
 	appendStringInfoChar(out, quote);
-
-	return true;
 }
 
 static bool
@@ -1010,12 +1034,9 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 					txt = "false";
 				break;
 			case JSON_STRING:
-				if (!encode_string(&ctx->str,
-								   node->v.string.str,
-								   node->v.string.length,
-								   '"',
-								   ctx->escape_unicode))
-					return false;
+				encode_string(&ctx->str,
+						node->v.string.str, node->v.string.length,
+						'"', ctx->escape_unicode);
 				break;
 			case JSON_NUMBER:
 				if (!encode_number(&ctx->str, node->v.number))
@@ -1049,18 +1070,10 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 						push_orig(key_left_space);
 
 					if (has_orig(key))
-					{
 						push_orig(key);
-					}
 					else
-					{
-						if (!encode_string(&ctx->str,
-										   node->key,
-										   node->key_length,
-										   '"',
-										   ctx->escape_unicode))
-							return false;
-					}
+						encode_string(&ctx->str, node->key, node->key_length,
+										'"', ctx->escape_unicode);
 
 					if (has_orig(key_right_space))
 						push_orig(key_right_space);
@@ -1092,171 +1105,26 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
 #undef push_orig
 }
 
-char *
-json_encode_string(const char *str, size_t length, char quote, bool escape_unicode)
-{
-	StringInfoData ret;
-
-	initStringInfo(&ret);
-	if (!encode_string(&ret, str, length, quote, escape_unicode))
-	{
-		pfree(ret.data);
-		return NULL;
-	}
-
-	return ret.data;
-}
-
-
-/****************************** Unicode ******************************/
-
-static const bool utf8_allow_surrogates = false;
-
-static void
-utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
-{
-	const unsigned char *s = (const unsigned char *) *sp;
-	unsigned char c = *s++;
-	unsigned int len;
-	unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7};
-
-	if (c < 0x80)
-		len = 0;
-	else if (c < 0xE0)
-		len = 1;
-	else if (c < 0xF0)
-		len = 2;
-	else
-		len = 3;
-
-	*uc = c & sf[len];
-	while (len--)
-	{
-		*uc <<= 6;
-		*uc |= *s++ & 0x3F;
-	}
-
-	*sp = (const char *) s;
-}
-
-static bool
-utf8_validate(const char *str, size_t length)
-{
-	const unsigned char *s = (const unsigned char *) str;
-	const unsigned char *e = s + length;
-
-	while (s < e)
-	{
-		unsigned char c = *s++;
-		unsigned int len;		/* number of bytes in sequence - 2 */
-
-		/* If character is ASCII, move on. */
-		if (c < 0x80)
-			continue;
-
-		if (s >= e)
-			return false;		/* Missing bytes in sequence. */
-
-		if (c < 0xE0)
-		{
-			/*
-			 * 2-byte sequence, U+0080 to U+07FF c must be 11000010 or higher
-			 * s[0] must be 10xxxxxx
-			 */
-			len = 0;
-			if (c < 0xC2)
-				return false;
-		}
-		else if (c < 0xF0)
-		{
-			/*
-			 * 3-byte sequence, U+0800 to U+FFFF Note that the surrogate range
-			 * is U+D800 to U+DFFF, and that U+FFFE and U+FFFF are illegal
-			 * characters. c must be >= 11100000 (which it is) If c is
-			 * 11100000, then s[0] must be >= 10100000 If the global parameter
-			 * utf8_allow_surrogates is false: If c is 11101101 and s[0] is >=
-			 * 10100000, then this is a surrogate and we should fail. If c is
-			 * 11101111, s[0] is 10111111, and s[1] >= 10111110, then this is
-			 * an illegal character and we should fail. s[0] and s[1] must be
-			 * 10xxxxxx
-			 */
-			len = 1;
-			if (c == 0xE0 && *s < 0xA0)
-				return false;
-			if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
-				return false;
-			if (c == 0xEF && s[0] == 0xBF && (s + 1 >= e || s[1] >= 0xBE))
-				return false;
-		}
-		else
-		{
-			/*
-			 * 4-byte sequence, U+010000 to U+10FFFF c must be >= 11110000
-			 * (which it is) and <= 11110100 If c is 11110000, then s[0] must
-			 * be >= 10010000 If c is 11110100, then s[0] must be < 10010000
-			 * s[0], s[1], and s[2] must be 10xxxxxx
-			 */
-			len = 2;
-			if (c > 0xF4)
-				return false;
-			if (c == 0xF0 && *s < 0x90)
-				return false;
-			if (c == 0xF4 && *s >= 0x90)
-				return false;
-		}
-
-		if (s + len >= e)
-			return false;		/* Missing bytes in sequence. */
-
-		do
-		{
-			if ((*s++ & 0xC0) != 0x80)
-				return false;
-		} while (len--);
-	}
-
-	return true;
-}
-
 /*
- * Encodes the Unicode character uc as UTF-8, writing it
- * to *out and updating *out to point to the end of the UTF-8 sequence.
+ * json_encode_string
+ *    If you're interested in encoding JSON in general, see json_encode.
  *
- * If uc is too high, no character will be emitted, and *out will
- * not be changed.	If uc is in the UTF-16 surrogate range
- * (U+D800 thru U+DFFF) or is a designated not-a-character
- * (U+FFFE or U+FFFF), the character will be emitted anyway,
- * although it is technically invalid UTF-8.
+ *    Encodes a string literal JSON-style using the given quote character.
+ *    Note that using anything but '"' as the quote character will result
+ *    in invalid JSON.
  *
- * Returns the number of characters emitted.
+ *    @str must be valid UTF-8, though it may contain null characters
+ *       (hence the length argument).
+ *    @quote must not be a backslash.
  */
-static int
-utf8_encode_char(char *out, unsigned int uc)
+char *
+json_encode_string(const char *str, size_t length, char quote,
+					bool escape_unicode)
 {
-	char	   *start = out;
+	StringInfoData ret;
 
-	if (uc < 0x80)
-	{
-		*out++ = uc & 0x7F;
-	}
-	else if (uc < 0x800)
-	{
-		*out++ = 0xC0 | (uc >> 6);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
-	else if (uc < 0x10000)
-	{
-		*out++ = 0xE0 | (uc >> 12);
-		*out++ = 0x80 | ((uc >> 6) & 0x3F);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
-	else if (uc < 0x110000)
-	{
-		*out++ = 0xF0 | ((uc >> 18) & 0x07);
-		*out++ = 0x80 | ((uc >> 12) & 0x3F);
-		*out++ = 0x80 | ((uc >> 6) & 0x3F);
-		*out++ = 0x80 | (uc & 0x3F);
-	}
+	initStringInfo(&ret);
+	encode_string(&ret, str, length, quote, escape_unicode);
 
-	return out - start;
+	return ret.data;
 }