summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Joey Adams, 2010-07-24 22:28:46 +0000
committer: Joey Adams, 2010-07-24 22:28:46 +0000
commit: f475e581b72b8c42cf951f6653610d15e71caeee (patch)
tree: 0a30a239ae10b3ed799688ecbd6791d4488a2cff
parent: b32257221b4b8e15fada7aabd0fe6e129e00d3e8 (diff)
* Migrated my Unicode functions to util.c and made them rely more on
PostgreSQL's pg_wchar.h routines.
* Touched up various functions' documentation.
json_nodes are currently encoded in UTF-8, and the JSON module is not yet 100% compatible with arbitrary server encodings. I plan to switch from UTF-8 to the server encoding pretty soon, after which JSON should be a well-behaved datatype as far as charsets go.
-rw-r--r--  json.c      250
-rw-r--r--  json.h       29
-rw-r--r--  json_io.c    12
-rw-r--r--  jsonpath.c    1
-rw-r--r--  util.c       64
-rw-r--r--  util.h        3
6 files changed, 135 insertions, 224 deletions
diff --git a/json.c b/json.c
index ec4e0ca..eed3665 100644
--- a/json.c
+++ b/json.c
@@ -22,10 +22,10 @@
*/
#include "json.h"
+#include "util.h"
#include <ctype.h>
-
#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)
/* We can't use isspace() because it also accepts \v and \f, which
@@ -102,10 +102,6 @@ write_hex16(char *out, unsigned int val)
*out++ = hex[val & 0xF];
}
-static bool utf8_validate(const char *str, size_t length);
-static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
-static int utf8_encode_char(char *out, unsigned int uc);
-
/*********** json_node creation, manipulation, and deletion **********/
@@ -352,6 +348,10 @@ char *json_decode_string(const char **sp, size_t *length, bool strict);
because it's also used to parse object member keys.
It's also useful outside of json.c, such as in jsonpath.c . */
+/*
+ * json_validate
+ * Make sure the given UTF-8 string is valid JSON.
+ */
bool
json_validate(const char *str)
{
@@ -363,6 +363,11 @@ json_validate(const char *str)
return true;
}
+/*
+ * json_decode
+ * Convert a JSON-encoded string to a JSON node.
+ * @str must be valid UTF-8.
+ */
json_node *
json_decode(const char *str)
{
@@ -378,8 +383,7 @@ json_decode(const char *str)
if (!str)
return NULL;
- if (!utf8_validate(str, strlen(str)))
- return NULL;
+ Assert(utf8_validate(str, strlen(str)));
expect_endp = false;
goto item;
@@ -601,8 +605,7 @@ decode_leaf(const char **sp)
* However, some JSON parsers are more liberal. For instance, PHP accepts
* '.5' and '1.'. JSON.parse accepts '+3'.
*
- * This function takes the strict approach. The user should use
- * json_clean() to handle liberal JSON text.
+ * This function takes the strict approach.
*/
static bool
validate_number(const char **sp)
@@ -669,6 +672,27 @@ decode_number(const char **sp)
return json_mknumber(start, end - start);
}
+/*
+ * json_decode_string
+ * If you're interested in decoding JSON in general, see json_decode.
+ *
+ * Decodes a JSON string literal (e.g. "\"hello\"").
+ *
+ * If strict is true, string must be double-quoted,
+ * as is required by the JSON RFC.
+ * Otherwise (e.g. if parsing something JSON-like, such as JSONPath),
+ * the string may be single- or double-quoted.
+ *
+ * Also, no whitespace skipping is done, so the caller should only
+ * call this function when it expects **sp to be either " or '
+ *
+ * On success, returns the decoded string, passes that string's length
+ * through *length (which must not be NULL), and advances *sp to point
+ * to the end of string literal (including the quote character).
+ *
+ * On failure (parse error), returns NULL and
+ * leaves *length and *sp untouched.
+ */
char *
json_decode_string(const char **sp, size_t *length, bool strict)
{
@@ -754,6 +778,7 @@ json_decode_string(const char **sp, size_t *length, bool strict)
goto failed;
len = utf8_encode_char(buf, uc);
+ Assert(len > 0);
appendBinaryStringInfo(&ret, buf, len);
continue; /* Continue the enclosing while loop to skip
@@ -825,14 +850,15 @@ json_text_type(const char *str, size_t nbytes)
/****************************** Encoding *****************************/
-static bool
-encode_string(StringInfo out, const char *string, size_t length, char quote, bool escape_unicode)
+static void
+encode_string(StringInfo out, const char *string, size_t length, char quote,
+ bool escape_unicode)
{
const char *s = string;
const char *e = s + length;
- if (!utf8_validate(string, length) || quote == '\\')
- return false;
+ Assert(utf8_validate(string, length));
+ Assert(quote != '\\');
appendStringInfoChar(out, quote);
@@ -912,8 +938,6 @@ encode_string(StringInfo out, const char *string, size_t length, char quote, boo
}
appendStringInfoChar(out, quote);
-
- return true;
}
static bool
@@ -1010,12 +1034,9 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
txt = "false";
break;
case JSON_STRING:
- if (!encode_string(&ctx->str,
- node->v.string.str,
- node->v.string.length,
- '"',
- ctx->escape_unicode))
- return false;
+ encode_string(&ctx->str,
+ node->v.string.str, node->v.string.length,
+ '"', ctx->escape_unicode);
break;
case JSON_NUMBER:
if (!encode_number(&ctx->str, node->v.number))
@@ -1049,18 +1070,10 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
push_orig(key_left_space);
if (has_orig(key))
- {
push_orig(key);
- }
else
- {
- if (!encode_string(&ctx->str,
- node->key,
- node->key_length,
- '"',
- ctx->escape_unicode))
- return false;
- }
+ encode_string(&ctx->str, node->key, node->key_length,
+ '"', ctx->escape_unicode);
if (has_orig(key_right_space))
push_orig(key_right_space);
@@ -1092,171 +1105,26 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx)
#undef push_orig
}
-char *
-json_encode_string(const char *str, size_t length, char quote, bool escape_unicode)
-{
- StringInfoData ret;
-
- initStringInfo(&ret);
- if (!encode_string(&ret, str, length, quote, escape_unicode))
- {
- pfree(ret.data);
- return NULL;
- }
-
- return ret.data;
-}
-
-
-/****************************** Unicode ******************************/
-
-static const bool utf8_allow_surrogates = false;
-
-static void
-utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
-{
- const unsigned char *s = (const unsigned char *) *sp;
- unsigned char c = *s++;
- unsigned int len;
- unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7};
-
- if (c < 0x80)
- len = 0;
- else if (c < 0xE0)
- len = 1;
- else if (c < 0xF0)
- len = 2;
- else
- len = 3;
-
- *uc = c & sf[len];
- while (len--)
- {
- *uc <<= 6;
- *uc |= *s++ & 0x3F;
- }
-
- *sp = (const char *) s;
-}
-
-static bool
-utf8_validate(const char *str, size_t length)
-{
- const unsigned char *s = (const unsigned char *) str;
- const unsigned char *e = s + length;
-
- while (s < e)
- {
- unsigned char c = *s++;
- unsigned int len; /* number of bytes in sequence - 2 */
-
- /* If character is ASCII, move on. */
- if (c < 0x80)
- continue;
-
- if (s >= e)
- return false; /* Missing bytes in sequence. */
-
- if (c < 0xE0)
- {
- /*
- * 2-byte sequence, U+0080 to U+07FF c must be 11000010 or higher
- * s[0] must be 10xxxxxx
- */
- len = 0;
- if (c < 0xC2)
- return false;
- }
- else if (c < 0xF0)
- {
- /*
- * 3-byte sequence, U+0800 to U+FFFF Note that the surrogate range
- * is U+D800 to U+DFFF, and that U+FFFE and U+FFFF are illegal
- * characters. c must be >= 11100000 (which it is) If c is
- * 11100000, then s[0] must be >= 10100000 If the global parameter
- * utf8_allow_surrogates is false: If c is 11101101 and s[0] is >=
- * 10100000, then this is a surrogate and we should fail. If c is
- * 11101111, s[0] is 10111111, and s[1] >= 10111110, then this is
- * an illegal character and we should fail. s[0] and s[1] must be
- * 10xxxxxx
- */
- len = 1;
- if (c == 0xE0 && *s < 0xA0)
- return false;
- if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
- return false;
- if (c == 0xEF && s[0] == 0xBF && (s + 1 >= e || s[1] >= 0xBE))
- return false;
- }
- else
- {
- /*
- * 4-byte sequence, U+010000 to U+10FFFF c must be >= 11110000
- * (which it is) and <= 11110100 If c is 11110000, then s[0] must
- * be >= 10010000 If c is 11110100, then s[0] must be < 10010000
- * s[0], s[1], and s[2] must be 10xxxxxx
- */
- len = 2;
- if (c > 0xF4)
- return false;
- if (c == 0xF0 && *s < 0x90)
- return false;
- if (c == 0xF4 && *s >= 0x90)
- return false;
- }
-
- if (s + len >= e)
- return false; /* Missing bytes in sequence. */
-
- do
- {
- if ((*s++ & 0xC0) != 0x80)
- return false;
- } while (len--);
- }
-
- return true;
-}
-
/*
- * Encodes the Unicode character uc as UTF-8, writing it
- * to *out and updating *out to point to the end of the UTF-8 sequence.
+ * json_encode_string
+ * If you're interested in encoding JSON in general, see json_encode .
*
- * If uc is too high, no character will be emitted, and *out will
- * not be changed. If uc is in the UTF-16 surrogate range
- * (U+D800 thru U+DFFF) or is a designated not-a-character
- * (U+FFFE or U+FFFF), the character will be emitted anyway,
- * although it is technically invalid UTF-8.
+ * Encodes a string literal JSON-style using the given quote character.
+ * Note that using anything but '"' as the quote character will result
+ * in invalid JSON.
*
- * Returns the number of characters emitted.
+ * @str must be valid UTF-8, though it may contain null characters
+ * (hence the length argument).
+ * @quote must not be a backslash.
*/
-static int
-utf8_encode_char(char *out, unsigned int uc)
+char *
+json_encode_string(const char *str, size_t length, char quote,
+ bool escape_unicode)
{
- char *start = out;
+ StringInfoData ret;
- if (uc < 0x80)
- {
- *out++ = uc & 0x7F;
- }
- else if (uc < 0x800)
- {
- *out++ = 0xC0 | (uc >> 6);
- *out++ = 0x80 | (uc & 0x3F);
- }
- else if (uc < 0x10000)
- {
- *out++ = 0xE0 | (uc >> 12);
- *out++ = 0x80 | ((uc >> 6) & 0x3F);
- *out++ = 0x80 | (uc & 0x3F);
- }
- else if (uc < 0x110000)
- {
- *out++ = 0xF0 | ((uc >> 18) & 0x07);
- *out++ = 0x80 | ((uc >> 12) & 0x3F);
- *out++ = 0x80 | ((uc >> 6) & 0x3F);
- *out++ = 0x80 | (uc & 0x3F);
- }
+ initStringInfo(&ret);
+ encode_string(&ret, str, length, quote, escape_unicode);
- return out - start;
+ return ret.data;
}
diff --git a/json.h b/json.h
index 3391cea..de4fb26 100644
--- a/json.h
+++ b/json.h
@@ -162,32 +162,7 @@ json_head(json_node * parent)
}
}
-
-/*
- * Decodes a JSON-encoded string literal
- * (If you're interested in the decoding JSON in general, see json_decode).
- * If strict is true, string must be double-quoted,
- * as is required by the JSON RFC.
- * Otherwise, the string may be single- or double-quoted.
- * Also, no whitespace skipping is done, so the caller should only
- * call this function when it expects **sp to be either " or '
- *
- * On success, returns the decoded string and passes that string's length
- * through *length (which must not be NULL). On failure (parse error),
- * returns NULL and leaves *length untouched.
- */
char *json_decode_string(const char **sp, size_t *length, bool strict);
-
-/*
- * Encodes a string literal JSON-style using the given quote character,
- * only escaping characters when necessary
- * (If you're interested in encoding JSON in general, see json_encode).
- * Note that using anything but '"' as the quote character will result in
- * invalid JSON.
- *
- * Returns NULL if input is invalid UTF-8 or if an invalid quote character
- * (such as backslash) is given.
- */
char *json_encode_string(const char *str, size_t length, char quote, bool escape_unicode);
/* Add child to parent, putting it at the end. */
@@ -199,10 +174,6 @@ void json_remove(json_node * node);
/* Update the value of a node, preserving position and key information. */
void json_replace_value(json_node * node, json_node * replacement);
-/* Note that the factory functions and get/set functions do not validate input.
- * However, json_encode validates node contents to avoid producing
- * invalid JSON. */
-
/* Node factory functions */
json_node *json_mknode(json_type type);
json_node *json_mkbool(bool v_bool);
diff --git a/json_io.c b/json_io.c
index da9d6b7..502372e 100644
--- a/json_io.c
+++ b/json_io.c
@@ -2,6 +2,7 @@
#include "util.h"
#include "utils/array.h"
+#include "mb/pg_wchar.h"
PG_MODULE_MAGIC;
@@ -19,10 +20,15 @@ Datum json_in(PG_FUNCTION_ARGS);
Datum
json_in(PG_FUNCTION_ARGS)
{
- const char *s = PG_GETARG_CSTRING(0);
- jsontype *vardata = cstring_to_text(s);
+ char *string = PG_GETARG_CSTRING(0);
+ jsontype *vardata = cstring_to_text(string);
+ int len = VARSIZE(vardata) - VARHDRSZ;
+ char *utf8string;
- if (!json_validate(s))
+ utf8string = (char *) pg_do_encoding_conversion(
+ (unsigned char *) string, len, GetDatabaseEncoding(), PG_UTF8);
+
+ if (!json_validate(utf8string))
elog(ERROR, "invalid JSON content");
PG_RETURN_JSON_P(vardata);
diff --git a/jsonpath.c b/jsonpath.c
index eaf5b7a..591c5fd 100644
--- a/jsonpath.c
+++ b/jsonpath.c
@@ -176,7 +176,6 @@ jp_show(JSONPath * jp)
break;
case JP_KEY_SUBSCRIPT:
tmp = json_encode_string(elem->data.key.ptr, elem->data.key.length, '"', false);
- Assert(tmp != NULL);
appendStringInfo(string, "%s[%s]", rd ? ".." : "", tmp);
pfree(tmp);
break;
diff --git a/util.c b/util.c
index 39fb0b8..9c64542 100644
--- a/util.c
+++ b/util.c
@@ -94,3 +94,67 @@ utf8_substring(
*out_bytes = sub_end - sub_start;
return sub_length;
}
+
+static const bool utf8_allow_surrogates = false;
+
+void
+utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
+{
+ const unsigned char *s = (const unsigned char *) *sp;
+ unsigned char c = *s++;
+ unsigned int len;
+ unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7};
+
+ if (c < 0x80)
+ len = 0;
+ else if (c < 0xE0)
+ len = 1;
+ else if (c < 0xF0)
+ len = 2;
+ else
+ len = 3;
+
+ *uc = c & sf[len];
+ while (len--)
+ {
+ *uc <<= 6;
+ *uc |= *s++ & 0x3F;
+ }
+
+ *sp = (const char *) s;
+}
+
+bool
+utf8_validate(const char *str, size_t length)
+{
+ const unsigned char *s = (const unsigned char *) str;
+ const unsigned char *e = s + length;
+ int len;
+
+ while (s < e)
+ {
+ if (*s <= 0x7F)
+ {
+ s++;
+ continue;
+ }
+
+ len = pg_utf_mblen(s);
+ if (s + len > e)
+ return false;
+
+ if (!pg_utf8_islegal(s, len))
+ return false;
+
+ s += len;
+ }
+
+ return true;
+}
+
+int
+utf8_encode_char(char *out, unsigned int uc)
+{
+ unicode_to_utf8(uc, (unsigned char *) out);
+ return pg_utf_mblen((unsigned char *) out);
+}
diff --git a/util.h b/util.h
index 40692cb..456708a 100644
--- a/util.h
+++ b/util.h
@@ -46,5 +46,8 @@ Oid enumLabelToOid(const char *typname, const char *label);
size_t utf8_substring(const char *src, size_t srcbytes,
size_t start, size_t length,
const char **out_start, size_t *out_bytes);
+void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
+bool utf8_validate(const char *str, size_t length);
+int utf8_encode_char(char *out, unsigned int uc);
#endif