diff options
Diffstat (limited to 'json.c')
| -rw-r--r-- | json.c | 250 |
1 files changed, 59 insertions, 191 deletions
@@ -22,10 +22,10 @@ */ #include "json.h" +#include "util.h" #include <ctype.h> - #define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT) /* We can't use isspace() because it also accepts \v and \f, which @@ -102,10 +102,6 @@ write_hex16(char *out, unsigned int val) *out++ = hex[val & 0xF]; } -static bool utf8_validate(const char *str, size_t length); -static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc); -static int utf8_encode_char(char *out, unsigned int uc); - /*********** json_node creation, manipulation, and deletion **********/ @@ -352,6 +348,10 @@ char *json_decode_string(const char **sp, size_t *length, bool strict); because it's also used to parse object member keys. It's also useful outside of json.c, such as in jsonpath.c . */ +/* + * json_validate + * Make sure the given UTF-8 string is valid JSON. + */ bool json_validate(const char *str) { @@ -363,6 +363,11 @@ json_validate(const char *str) return true; } +/* + * json_decode + * Convert a JSON-encoded string to a JSON node. + * @str must be valid UTF-8. + */ json_node * json_decode(const char *str) { @@ -378,8 +383,7 @@ json_decode(const char *str) if (!str) return NULL; - if (!utf8_validate(str, strlen(str))) - return NULL; + Assert(utf8_validate(str, strlen(str))); expect_endp = false; goto item; @@ -601,8 +605,7 @@ decode_leaf(const char **sp) * However, some JSON parsers are more liberal. For instance, PHP accepts * '.5' and '1.'. JSON.parse accepts '+3'. * - * This function takes the strict approach. The user should use - * json_clean() to handle liberal JSON text. + * This function takes the strict approach. */ static bool validate_number(const char **sp) @@ -669,6 +672,27 @@ decode_number(const char **sp) return json_mknumber(start, end - start); } +/* + * json_decode_string + * If you're interested in the decoding JSON in general, see json_decode. + * + * Decodes a JSON string literal (e.g. "\"hello\""). + * + * If strict is true, string must be double-quoted, + * as is required by the JSON RFC. + * Otherwise (e.g. if parsing something JSON-like, such as JSONPath), + * the string may be single- or double-quoted. + * + * Also, no whitespace skipping is done, so the caller should only + * call this function when it expects **sp to be either " or ' + * + * On success, returns the decoded string, passes that string's length + * through *length (which must not be NULL), and advances *sp to point + * to the end of string literal (including the quote character). + * + * On failure (parse error), returns NULL and + * leaves *length and *sp untouched. + */ char * json_decode_string(const char **sp, size_t *length, bool strict) { @@ -754,6 +778,7 @@ json_decode_string(const char **sp, size_t *length, bool strict) goto failed; len = utf8_encode_char(buf, uc); + Assert(len > 0); appendBinaryStringInfo(&ret, buf, len); continue; /* Continue the enclosing while loop to skip @@ -825,14 +850,15 @@ json_text_type(const char *str, size_t nbytes) /****************************** Encoding *****************************/ -static bool -encode_string(StringInfo out, const char *string, size_t length, char quote, bool escape_unicode) +static void +encode_string(StringInfo out, const char *string, size_t length, char quote, + bool escape_unicode) { const char *s = string; const char *e = s + length; - if (!utf8_validate(string, length) || quote == '\\') - return false; + Assert(utf8_validate(string, length)); + Assert(quote != '\\'); appendStringInfoChar(out, quote); @@ -912,8 +938,6 @@ encode_string(StringInfo out, const char *string, size_t length, char quote, boo } appendStringInfoChar(out, quote); - - return true; } static bool @@ -1010,12 +1034,9 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx) txt = "false"; break; case JSON_STRING: - if (!encode_string(&ctx->str, - node->v.string.str, - node->v.string.length, - '"', - ctx->escape_unicode)) - return false; + encode_string(&ctx->str, + node->v.string.str, node->v.string.length, + '"', ctx->escape_unicode); break; case JSON_NUMBER: if (!encode_number(&ctx->str, node->v.number)) @@ -1049,18 +1070,10 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx) push_orig(key_left_space); if (has_orig(key)) - { push_orig(key); - } else - { - if (!encode_string(&ctx->str, - node->key, - node->key_length, - '"', - ctx->escape_unicode)) - return false; - } + encode_string(&ctx->str, node->key, node->key_length, + '"', ctx->escape_unicode); if (has_orig(key_right_space)) push_orig(key_right_space); @@ -1092,171 +1105,26 @@ json_encode_recurse(json_node * node, json_encode_ctx * ctx) #undef push_orig } -char * -json_encode_string(const char *str, size_t length, char quote, bool escape_unicode) -{ - StringInfoData ret; - - initStringInfo(&ret); - if (!encode_string(&ret, str, length, quote, escape_unicode)) - { - pfree(ret.data); - return NULL; - } - - return ret.data; -} - - -/****************************** Unicode ******************************/ - -static const bool utf8_allow_surrogates = false; - -static void -utf8_decode_char_nocheck(const char **sp, unsigned int *uc) -{ - const unsigned char *s = (const unsigned char *) *sp; - unsigned char c = *s++; - unsigned int len; - unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7}; - - if (c < 0x80) - len = 0; - else if (c < 0xE0) - len = 1; - else if (c < 0xF0) - len = 2; - else - len = 3; - - *uc = c & sf[len]; - while (len--) - { - *uc <<= 6; - *uc |= *s++ & 0x3F; - } - - *sp = (const char *) s; -} - -static bool -utf8_validate(const char *str, size_t length) -{ - const unsigned char *s = (const unsigned char *) str; - const unsigned char *e = s + length; - - while (s < e) - { - unsigned char c = *s++; - unsigned int len; /* number of bytes in sequence - 2 */ - - /* If character is ASCII, move on. */ - if (c < 0x80) - continue; - - if (s >= e) - return false; /* Missing bytes in sequence. */ - - if (c < 0xE0) - { - /* - * 2-byte sequence, U+0080 to U+07FF c must be 11000010 or higher - * s[0] must be 10xxxxxx - */ - len = 0; - if (c < 0xC2) - return false; - } - else if (c < 0xF0) - { - /* - * 3-byte sequence, U+0800 to U+FFFF Note that the surrogate range - * is U+D800 to U+DFFF, and that U+FFFE and U+FFFF are illegal - * characters. c must be >= 11100000 (which it is) If c is - * 11100000, then s[0] must be >= 10100000 If the global parameter - * utf8_allow_surrogates is false: If c is 11101101 and s[0] is >= - * 10100000, then this is a surrogate and we should fail. If c is - * 11101111, s[0] is 10111111, and s[1] >= 10111110, then this is - * an illegal character and we should fail. s[0] and s[1] must be - * 10xxxxxx - */ - len = 1; - if (c == 0xE0 && *s < 0xA0) - return false; - if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0) - return false; - if (c == 0xEF && s[0] == 0xBF && (s + 1 >= e || s[1] >= 0xBE)) - return false; - } - else - { - /* - * 4-byte sequence, U+010000 to U+10FFFF c must be >= 11110000 - * (which it is) and <= 11110100 If c is 11110000, then s[0] must - * be >= 10010000 If c is 11110100, then s[0] must be < 10010000 - * s[0], s[1], and s[2] must be 10xxxxxx - */ - len = 2; - if (c > 0xF4) - return false; - if (c == 0xF0 && *s < 0x90) - return false; - if (c == 0xF4 && *s >= 0x90) - return false; - } - - if (s + len >= e) - return false; /* Missing bytes in sequence. */ - - do - { - if ((*s++ & 0xC0) != 0x80) - return false; - } while (len--); - } - - return true; -} - /* - * Encodes the Unicode character uc as UTF-8, writing it - * to *out and updating *out to point to the end of the UTF-8 sequence. + * json_encode_string + * If you're interested in encoding JSON in general, see json_encode . * - * If uc is too high, no character will be emitted, and *out will - * not be changed. If uc is in the UTF-16 surrogate range - * (U+D800 thru U+DFFF) or is a designated not-a-character - * (U+FFFE or U+FFFF), the character will be emitted anyway, - * although it is technically invalid UTF-8. + * Encodes a string literal JSON-style using the given quote character. + * Note that using anything but '"' as the quote character will result + * in invalid JSON. * - * Returns the number of characters emitted. + * @str must be valid UTF-8, though it may contain null characters + * (hence the length argument). + * @quote must not be a backslash. */ -static int -utf8_encode_char(char *out, unsigned int uc) +char * +json_encode_string(const char *str, size_t length, char quote, + bool escape_unicode) { - char *start = out; + StringInfoData ret; - if (uc < 0x80) - { - *out++ = uc & 0x7F; - } - else if (uc < 0x800) - { - *out++ = 0xC0 | (uc >> 6); - *out++ = 0x80 | (uc & 0x3F); - } - else if (uc < 0x10000) - { - *out++ = 0xE0 | (uc >> 12); - *out++ = 0x80 | ((uc >> 6) & 0x3F); - *out++ = 0x80 | (uc & 0x3F); - } - else if (uc < 0x110000) - { - *out++ = 0xF0 | ((uc >> 18) & 0x07); - *out++ = 0x80 | ((uc >> 12) & 0x3F); - *out++ = 0x80 | ((uc >> 6) & 0x3F); - *out++ = 0x80 | (uc & 0x3F); - } + initStringInfo(&ret); + encode_string(&ret, str, length, quote, escape_unicode); - return out - start; + return ret.data; } |
