/*
    Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
    All rights reserved.

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
*/

#include "json.h"
#include "util.h"

#include <ctype.h>

/* True for node types that can hold children (arrays and objects). */
#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)

/* We can't use isspace() because it also accepts \v and \f, which
   aren't legal whitespace characters in strict JSON. */
#define is_whitespace(c) ((c)==' ' || (c)=='\t' || (c)=='\n' || (c)=='\r')

/*
 * Advance *sp past any run of JSON whitespace (space, tab, LF, CR).
 * Stops at the first other character, including the NUL terminator.
 */
static void
skip_whitespace(const char **sp)
{
    const char *s = *sp;

    while (is_whitespace(*s))
        s++;
    *sp = s;
}

/*
 * Return the closing bracket that matches the given container node:
 * ']' for an array, '}' for an object.  Returns 0 for a NULL node or
 * for any node that is not a container.
 */
static char
end_parenthesis(json_node * node)
{
    if (!node)
        return 0;

    switch (node->type)
    {
        case JSON_ARRAY:
            return ']';
        case JSON_OBJECT:
            return '}';
        default:
            return 0;
    }
}

/*
 * Reads exactly 4 hex characters (capital or lowercase).
 * Writes the result to *out .
 * Returns true on success, false on failure.
 *
 * On failure *out holds a partial result and must not be used.  A NUL
 * byte is not a hex digit, so this never reads past the terminator of
 * a NUL-terminated string.
 */
static bool
read_hex16(const char *in, unsigned int *out)
{
    unsigned int i;
    unsigned int tmp;
    char        c;

    *out = 0;

    for (i = 0; i < 4; i++)
    {
        c = *in++;
        if (c >= '0' && c <= '9')
            tmp = c - '0';
        else if (c >= 'A' && c <= 'F')
            tmp = c - 'A' + 10;
        else if (c >= 'a' && c <= 'f')
            tmp = c - 'a' + 10;
        else
            return false;

        *out <<= 4;
        *out += tmp;
    }

    return true;
}

/*
 * Write the low 16 bits of val to out as exactly 4 uppercase hex
 * digits.  No NUL terminator is written.
 */
static void
write_hex16(char *out, unsigned int val)
{
    const char *hex = "0123456789ABCDEF";

    *out++ = hex[(val >> 12) & 0xF];
    *out++ = hex[(val >> 8) & 0xF];
    *out++ = hex[(val >> 4) & 0xF];
    *out++ = hex[val & 0xF];
}


/*********** json_node creation, manipulation, and deletion **********/

/*
 * Allocate a zero-filled node of the given type.  The node has no
 * parent, siblings, key, value, or original-text pointers.
 */
json_node *
json_mknode(json_type type)
{
    json_node  *node = palloc(sizeof(*node));

    memset(node, 0, sizeof(*node));
    node->type = type;
    return node;
}

/* Allocate a JSON_BOOL node holding v_bool. */
json_node *
json_mkbool(bool v_bool)
{
    json_node  *node = json_mknode(JSON_BOOL);

    node->v.v_bool = v_bool;
    return node;
}

/*
 * Allocate a JSON_STRING node holding a private copy of the first
 * `length` bytes of str.  If str is NULL the node's string pointer is
 * left NULL and its length 0.
 */
json_node *
json_mkstring(const char *str, size_t length)
{
    json_node  *node = json_mknode(JSON_STRING);

    if (str)
    {
        node->v.string.str = pnstrdup(str, length);
        node->v.string.length = length;
    }
    return node;
}

/*
 * Allocate a JSON_NUMBER node holding a private copy of the first
 * `length` bytes of the number's text.  The text is not validated here;
 * it is checked when the node is encoded.  If number is NULL the node's
 * number pointer is left NULL.
 */
json_node *
json_mknumber(const char *number, size_t length)
{
    json_node  *node = json_mknode(JSON_NUMBER);

    if (number)
        node->v.number = pnstrdup(number, length);
    return node;
}

/*
 * Indicate that the node's value has changed,
 * marking ancestors as necessary.
 *
 * Call json_touch_value so that json_encode(..., JSONOPT_USE_ORIG)
 * will encode the new value rather than using original text.
 *
 * Walks up the parent chain clearing orig.value.start, and stops at the
 * first node whose orig.value.start is already NULL.
 */
void
json_touch_value(json_node * node)
{
    while (node && node->orig.value.start)
    {
        node->orig.value.start = NULL;
        node = node->parent;
    }
}

/*
 * Link child as the last child of parent without invalidating the
 * parent's original text.  Used by the parser, where the original text
 * is by construction still accurate.
 *
 * parent must be an array or object; child must not already have a
 * parent.
 */
static void
json_append_notouch(json_node * parent, json_node * child)
{
    Assert(parent->type == JSON_ARRAY || parent->type == JSON_OBJECT);
    Assert(child->parent == NULL);

    parent->v.children.count++;
    child->parent = parent;
    child->prev = parent->v.children.tail;
    child->next = NULL;

    if (parent->v.children.tail)
    {
        parent->v.children.tail->next = child;
        parent->v.children.tail = child;
    }
    else
    {
        parent->v.children.head = parent->v.children.tail = child;
    }
}

/*
 * Append child as the last child of parent and mark parent (and its
 * ancestors) as modified.
 */
void
json_append(json_node * parent, json_node * child)
{
    json_append_notouch(parent, child);
    json_touch_value(parent);
}

/*
 * Unlink node from its parent's child list and mark the parent as
 * modified.  The node itself is not freed.  Does nothing if the node
 * has no parent.
 */
void
json_remove(json_node * node)
{
    json_node  *parent = node->parent;

    if (!parent)
        return;

    Assert(parent->type == JSON_ARRAY || parent->type == JSON_OBJECT);
    Assert(parent->v.children.count > 0);

    if (node->prev)
        node->prev->next = node->next;
    else
        parent->v.children.head = node->next;

    if (node->next)
        node->next->prev = node->prev;
    else
        parent->v.children.tail = node->prev;

    parent->v.children.count--;

    node->parent = NULL;
    node->prev = NULL;
    node->next = NULL;

    json_touch_value(parent);
}

/*
 * Overwrite node's type, value, and original value text with those of
 * replacement, then mark node's parent as modified.  The node keeps its
 * own key and position in the tree.
 *
 * NOTE(review): this is a shallow copy.  The previous contents of
 * node->v are not freed here, the two nodes share any pointers inside
 * `v` afterwards, and if replacement is an array/object its children's
 * parent pointers are not updated.  Confirm that callers account for
 * this.
 */
void
json_replace_value(json_node * node, json_node * replacement)
{
    node->type = replacement->type;
    node->v = replacement->v;
    node->orig.value = replacement->orig.value;

    if (node->parent)
        json_touch_value(node->parent);
}

/*
 * Return the string held by a JSON_STRING node (may be NULL).  If
 * length_out is not NULL, the string's length is stored through it.
 * The returned pointer still belongs to the node.
 */
const char *
json_get_string(json_node * node, size_t *length_out)
{
    Assert(node->type == JSON_STRING);

    if (length_out)
        *length_out = node->v.string.length;
    return node->v.string.str;
}

/*
 * Replace the string held by a JSON_STRING node with a copy of the
 * first `length` bytes of str (or with NULL/0 if str is NULL), freeing
 * the old string, and mark the node as modified.
 */
void
json_set_string(json_node * node, const char *str, size_t length)
{
    Assert(node->type == JSON_STRING);

    /* Guard kept: assuming PostgreSQL's pfree(), which rejects NULL. */
    if (node->v.string.str)
        pfree(node->v.string.str);

    if (str)
    {
        node->v.string.str = pnstrdup(str, length);
        node->v.string.length = length;
    }
    else
    {
        node->v.string.str = NULL;
        node->v.string.length = 0;
    }

    json_touch_value(node);
}

/*
 * Return the textual number held by a JSON_NUMBER node (may be NULL).
 * The returned pointer still belongs to the node.
 */
const char *
json_get_number(json_node * node)
{
    Assert(node->type == JSON_NUMBER);

    return node->v.number;
}

/*
 * Replace the number text held by a JSON_NUMBER node with a copy of the
 * first `length` bytes of number (or NULL if number is NULL), freeing
 * the old text, and mark the node as modified.  The text is not
 * validated here.
 */
void
json_set_number(json_node * node, const char *number, size_t length)
{
    Assert(node->type == JSON_NUMBER);

    /* Guard kept: assuming PostgreSQL's pfree(), which rejects NULL. */
    if (node->v.number)
        pfree(node->v.number);

    if (number)
        node->v.number = pnstrdup(number, length);
    else
        node->v.number = NULL;

    json_touch_value(node);
}

/* Non-recursively free a node */
static void
free_node(json_node * node)
{
    if (node->type == JSON_STRING)
    {
        if (node->v.string.str)
            pfree(node->v.string.str);
    }
    else if (node->type == JSON_NUMBER)
    {
        if (node->v.number)
            pfree(node->v.number);
    }

    if (node->key)
        pfree(node->key);

    pfree(node);
}

/*
 * Detach node from its parent (if any) and free it together with all of
 * its descendants.  Passing NULL is a no-op.
 *
 * The traversal is iterative: it descends to the left-most leaf, frees
 * it, moves to the next sibling, and when a sibling list is exhausted it
 * ascends to the parent, whose children have all been freed by then.
 * The walk ends at the starting node because json_remove() leaves it
 * with no parent and no siblings.
 */
void
json_delete(json_node * node)
{
    json_node  *parent,
               *next;

    if (!node)
        return;

    /* Remove node from parent (if it has one). */
    json_remove(node);

    goto descend;

descend:
    while (is_internal(node) && node->v.children.head)
        node = node->v.children.head;
    goto advance;

advance:
    /* Read the links before freeing the node that holds them. */
    parent = node->parent;
    next = node->next;
    free_node(node);
    node = next;

    if (node)
        goto descend;
    else
        goto ascend;

ascend:
    node = parent;

    if (node)
        goto advance;
    else
        return;
}


/*********************** Parsing and validation **********************/

static json_node *decode_leaf(const char **sp);
static json_node *decode_number(const char **sp);

char       *json_decode_string(const char **sp, size_t *length, bool strict);

/* json_decode_string has a different signature than its friends
   because it's also used to parse object member keys.
   It's also useful outside of json.c, such as in jsonpath.c . */

/*
 * json_validate
 *    Make sure the given UTF-8 string is valid JSON.
 *
 * Implemented by fully decoding the text and discarding the result.
 */
bool
json_validate(const char *str)
{
    json_node  *node = json_decode(str);

    if (!node)
        return false;
    json_delete(node);
    return true;
}

/*
 * json_decode
 *    Convert a JSON-encoded string to a JSON node.
 *    @str must be valid UTF-8.
 *
 * Returns the root node on success, or NULL if str is NULL or is not
 * valid JSON (any partially built tree is freed).
 *
 * The parser is a goto-driven state machine rather than a recursive
 * descent parser.  States:
 *    item       - expect a value (or, if expect_endp, a closing bracket)
 *    comma_endp - expect ',' or a closing bracket
 *    endp       - consume a closing bracket and pop to the parent
 *    end        - expect end of input
 *    failed     - clean up and return NULL
 *
 * Each node also records where its key, value, and surrounding
 * whitespace were found in str (node->orig).  Those pointers refer into
 * the caller's buffer.
 */
json_node *
json_decode(const char *str)
{
    json_node  *root = NULL,
               *parent = NULL,
               *node = NULL;
    const char *s = str;
    char       *key = NULL;
    size_t      key_length;
    struct json_node_orig orig;
    bool        expect_endp;

    if (!str)
        return NULL;

    Assert(utf8_validate(str, strlen(str)));

    expect_endp = false;
    goto item;

item:
    /* Expect a value (set expect_endp before goto item; ) */
    key = NULL;
    key_length = 0;
    memset(&orig, 0, sizeof(orig));

    orig.key_left_space.start = s;
    orig.left_space.start = s;

    skip_whitespace(&s);

    if (expect_endp)
    {
        if (*s == ']' || *s == '}')
            goto endp;
    }

    if (parent && parent->type == JSON_OBJECT)
    {
        /* Parse member key string. */
        orig.key_left_space.end = s;
        orig.key.start = s;

        key = json_decode_string(&s, &key_length, true);
        if (!key)
            goto failed;

        orig.key.end = s;
        orig.key_right_space.start = s;

        /* Eat the " : " */
        skip_whitespace(&s);
        if (*s != ':')
            goto failed;

        orig.key_right_space.end = s;
        s++;
        orig.left_space.start = s;

        skip_whitespace(&s);
    }

    /*
     * The way orig.value and company are initialized is a bit funky.  If
     * this node has children, we have to finish parsing the node's children
     * before we know where it ends.  Hence, initialization of orig.value.end
     * and after will be deferred if this node has children.
     */

    orig.left_space.end = s;
    orig.value.start = s;

    node = decode_leaf(&s);
    if (!node)
    {
        if (*s == '[')
            node = json_mknode(JSON_ARRAY);
        else if (*s == '{')
            node = json_mknode(JSON_OBJECT);
        else
            goto failed;
        s++;

        /*
         * orig.value.end and later are dangling (actually NULL) for now, but
         * will be initialized when we get to state 'endp' .
         */
    }
    else
    {
        orig.value.end = s;
        orig.right_space.start = s;
        skip_whitespace(&s);
        orig.right_space.end = s;
    }

    node->key = key;
    node->key_length = key_length;

    /*
     * The key now belongs to the node.  This prevents a double free on
     * failure (see the failed: label).
     */
    key = NULL;

    node->orig = orig;

    if (parent)
        json_append_notouch(parent, node);
    else
        root = node;

    if (is_internal(node))
    {
        /*
         * "push" node onto the "stack".  Nodes point up to their parents,
         * which is why this function doesn't need a "stack" per se.
         */
        parent = node;

        expect_endp = true;
        goto item;
    }

    if (parent)
        goto comma_endp;
    else
        goto end;

comma_endp:
    /* Expect a comma or end bracket/brace */
    if (*s == ',')
    {
        s++;

        /* A value must follow a comma, so "[1,]" is rejected. */
        expect_endp = false;
        goto item;
    }
    if (*s == ']' || *s == '}')
        goto endp;

    goto failed;

endp:
    /* Handle an end bracket/brace */
    if (*s != end_parenthesis(parent))
        goto failed;
    s++;

    /* "pop" a node from the "stack" */
    node = parent;
    parent = parent->parent;

    /*
     * The other pointers were set when we started parsing this node in the
     * 'item' state.
     */
    node->orig.value.end = s;
    node->orig.right_space.start = s;
    skip_whitespace(&s);
    node->orig.right_space.end = s;

    if (parent)
        goto comma_endp;
    else
        goto end;

end:
    /* Expect end of text */
    if (*s)
        goto failed;
    return node;

failed:
    /* Handle failure */
    if (key)
        pfree(key);
    json_delete(root);
    return NULL;
}

/*
 * Decode and skip a node that does not have children.
 * Whitespace is not skipped first (it is done in the primary decode loop).
 *
 * Returns NULL if next character is '[', '{', or invalid.
 *
 * The keywords are matched by prefix only, so "truex" yields a true
 * node with *sp left at 'x'; the caller rejects the trailing text.
 */
static json_node *
decode_leaf(const char **sp)
{
    char        c = **sp;

    if (c == '"')
    {
        size_t      length;
        char       *str = json_decode_string(sp, &length, true);

        if (str)
        {
            /* The node takes ownership of the decoded buffer. */
            json_node  *node = json_mknode(JSON_STRING);

            node->v.string.str = str;
            node->v.string.length = length;
            return node;
        }

        return NULL;
    }

    if ((c >= '0' && c <= '9') || c == '-')
        return decode_number(sp);

    if (!strncmp(*sp, "true", 4))
    {
        (*sp) += 4;
        return json_mkbool(true);
    }

    if (!strncmp(*sp, "false", 5))
    {
        (*sp) += 5;
        return json_mkbool(false);
    }

    if (!strncmp(*sp, "null", 4))
    {
        (*sp) += 4;
        return json_mknode(JSON_NULL);
    }

    return NULL;
}

/*
 * The JSON spec says that a number shall follow this precise pattern
 * (spaces and quotes added for readability):
 *   '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)?
 *
 * However, some JSON parsers are more liberal.  For instance, PHP accepts
 * '.5' and '1.'.  JSON.parse accepts '+3'.
 *
 * This function takes the strict approach.
 *
 * On success, advances *sp past the number and returns true.  On
 * failure, returns false and leaves *sp untouched.
 *
 * isdigit() is given an unsigned char value: passing a negative char
 * (possible for bytes >= 0x80 where char is signed) is undefined.
 */
static bool
validate_number(const char **sp)
{
    const char *s = *sp;

    /* '-'? */
    if (*s == '-')
        s++;

    /* (0 | [1-9][0-9]*) */
    if (*s == '0')
    {
        s++;
    }
    else
    {
        if (!isdigit((unsigned char) *s))
            return false;
        do
            s++;
        while (isdigit((unsigned char) *s));
    }

    /* ('.' [0-9]+)? */
    if (*s == '.')
    {
        s++;
        if (!isdigit((unsigned char) *s))
            return false;
        do
            s++;
        while (isdigit((unsigned char) *s));
    }

    /* ([Ee] [+-]? [0-9]+)? */
    if (*s == 'E' || *s == 'e')
    {
        s++;
        if (*s == '+' || *s == '-')
            s++;
        if (!isdigit((unsigned char) *s))
            return false;
        do
            s++;
        while (isdigit((unsigned char) *s));
    }

    *sp = s;
    return true;
}

/*
 * Parse a strict JSON number at *sp and return it as a JSON_NUMBER node
 * holding a copy of the number's text.  Returns NULL (leaving *sp
 * untouched) if the text is not a valid number.
 */
static json_node *
decode_number(const char **sp)
{
    const char *start,
               *end;

    start = *sp;
    if (!validate_number(sp))
        return NULL;
    end = *sp;

    return json_mknumber(start, end - start);
}

/*
 * json_decode_string
 *    If you're interested in the decoding JSON in general, see json_decode.
 *
 *    Decodes a JSON string literal (e.g. "\"hello\"").
 *
 *    If strict is true, string must be double-quoted,
 *    as is required by the JSON RFC.
 *    Otherwise (e.g. if parsing something JSON-like, such as JSONPath),
 *    the string may be single- or double-quoted.
 *
 *    Also, no whitespace skipping is done, so the caller should only
 *    call this function when it expects **sp to be either " or '
 *
 *    On success, returns the decoded string, passes that string's length
 *    through *length (which must not be NULL), and advances *sp to point
 *    to the end of string literal (including the quote character).
 *    The caller owns the returned buffer.
 *
 *    On failure (parse error), returns NULL and
 *    leaves *length and *sp untouched.
 */
char *
json_decode_string(const char **sp, size_t *length, bool strict)
{
    const char *s = *sp;
    StringInfoData ret;
    char        buf[4];
    int         len;
    char        quote;

    Assert(length != NULL);

    /*
     * Check the opening quote before allocating the output buffer, so
     * that rejecting it does not leak the buffer.
     */
    quote = *s++;
    if (strict)
    {
        if (quote != '"')
            return NULL;
    }
    else
    {
        if (quote != '"' && quote != '\'')
            return NULL;
    }

    initStringInfo(&ret);

    while (*s && *s != quote)
    {
        unsigned char c = *s++;
        unsigned int uc;
        unsigned int lc;

        if (c == '\\')
        {
            c = *s++;
            switch (c)
            {
                case '\\':
                case '/':
                    break;
                case 'b':
                    c = '\b';
                    break;
                case 'f':
                    c = '\f';
                    break;
                case 'n':
                    c = '\n';
                    break;
                case 'r':
                    c = '\r';
                    break;
                case 't':
                    c = '\t';
                    break;
                case 'u':
                    if (!read_hex16(s, &uc))
                        goto failed;
                    s += 4;

                    if (uc >= 0xD800 && uc <= 0xDFFF)
                    {
                        /* Handle UTF-16 surrogate pair. */

                        if (uc >= 0xDC00)
                            goto failed;    /* Second surrogate not preceded
                                             * by first surrogate. */

                        if (s[0] != '\\' || s[1] != 'u'
                            || !read_hex16(s + 2, &lc)
                            || !(lc >= 0xDC00 && lc <= 0xDFFF))
                            goto failed;    /* First surrogate not followed
                                             * by second surrogate. */

                        s += 6;

                        /*
                         * The 20 payload bits are an offset from U+10000.
                         * This must be an addition: the high surrogate's
                         * bits can reach bit 16, so OR-ing 0x10000 in
                         * would lose the carry (e.g. \uD840\uDC00 is
                         * U+20000, not U+10000).
                         */
                        uc = 0x10000 + (((uc & 0x3FF) << 10) | (lc & 0x3FF));
                    }

                    /* 0xFFFE and 0xFFFF are invalid Unicode */
                    if (uc == 0xFFFE || uc == 0xFFFF)
                        goto failed;

                    len = utf8_encode_char(buf, uc);
                    Assert(len > 0);

                    appendBinaryStringInfo(&ret, buf, len);

                    continue;   /* Continue the enclosing while loop to skip
                                 * the single-character append below. */
                default:
                    /* An escaped quote character is the only other escape. */
                    if (c == quote)
                        break;
                    if (!strict && (c == '"' || c == '\''))
                        break;
                    goto failed;    /* Invalid escape (this also catches a
                                     * backslash at end of input). */
            }
        }
        else if (c <= 0x1F)
        {
            /* Control characters not allowed in string literals. */
            goto failed;
        }
        appendStringInfoChar(&ret, c);
    }

    /* The loop ends at the closing quote or at NUL (unterminated). */
    if (!*s++)
        goto failed;

    *length = ret.len;
    *sp = s;
    return ret.data;

failed:
    pfree(ret.data);
    return NULL;
}

/*
 * Guess the type of a JSON text from its first non-whitespace
 * character, without parsing it.  Only the first `nbytes` bytes of str
 * are examined, and str need not be NUL-terminated.
 *
 * Returns JSON_INVALID if the text is empty or all whitespace, or if
 * its first character cannot begin any JSON value.  A result other than
 * JSON_INVALID does not mean the text is valid JSON.
 */
json_type
json_text_type(const char *str, size_t nbytes)
{
    const char *s = str;
    const char *e = str + nbytes;
    char        c;

    /* Skip whitespace characters. */
    while (s < e && is_whitespace(*s))
        s++;

    /* Get first non-white character, making sure it's in bounds. */
    if (s >= e)
        return JSON_INVALID;
    c = *s;

    switch (c)
    {
        case 'n':
            return JSON_NULL;
        case '"':
            return JSON_STRING;
        case 't':
        case 'f':
            return JSON_BOOL;
        case '{':
            return JSON_OBJECT;
        case '[':
            return JSON_ARRAY;
        default:
            if (c == '-' || (c >= '0' && c <= '9'))
                return JSON_NUMBER;
            return JSON_INVALID;
    }
}


/****************************** Encoding *****************************/

/*
 * Append `string` (of `length` bytes, valid UTF-8, may contain NULs) to
 * out as a quoted, escaped string literal.
 *
 * Backslash, the quote character, and \b \f \n \r \t use two-character
 * escapes.  Every other control character (<= 0x1F) is written as
 * \uXXXX, as is every non-ASCII character when escape_unicode is true;
 * characters above U+FFFF are written as a surrogate pair.
 *
 * quote must not be a backslash.
 */
static void
encode_string(StringInfo out, const char *string, size_t length,
              char quote, bool escape_unicode)
{
    const char *s = string;
    const char *end = s + length;

    Assert(utf8_validate(string, length));
    Assert(quote != '\\');

    appendStringInfoChar(out, quote);

    while (s < end)
    {
        unsigned char c = *s++;
        unsigned char esc = 0;  /* char to follow a backslash, 0 if none */

        switch (c)
        {
            case '\\':
                esc = '\\';
                break;
            case '\b':
                esc = 'b';
                break;
            case '\f':
                esc = 'f';
                break;
            case '\n':
                esc = 'n';
                break;
            case '\r':
                esc = 'r';
                break;
            case '\t':
                esc = 't';
                break;
            default:
                if (c == quote)
                {
                    esc = quote;
                }
                else if (c <= 0x1F || (c >= 0x80 && escape_unicode))
                {
                    /*
                     * Encode using \u....
                     *
                     * The bound is inclusive: 0x1F is a control character
                     * too, and json_decode_string rejects it when it
                     * appears unescaped.
                     */
                    unsigned int uc,
                                lc;
                    char        txt[13];

                    /* Re-read the whole (possibly multi-byte) character. */
                    s--;
                    utf8_decode_char_nocheck(&s, &uc);

                    txt[0] = '\\';
                    txt[1] = 'u';
                    txt[6] = '\\';
                    txt[7] = 'u';
                    if (uc <= 0xFFFF)
                    {
                        write_hex16(txt + 2, uc);
                        txt[6] = '\0';
                    }
                    else
                    {
                        /* Split into a UTF-16 surrogate pair. */
                        uc -= 0x10000;
                        lc = uc & 0x3FF;
                        uc = uc >> 10;
                        uc |= 0xD800;
                        lc |= 0xDC00;
                        write_hex16(txt + 2, uc);
                        write_hex16(txt + 8, lc);
                        txt[12] = '\0';
                    }

                    appendStringInfoString(out, txt);
                    continue;   /* Skip backslash-encoding code below. */
                }
                break;
        }

        if (esc)
        {
            appendStringInfoChar(out, '\\');
            appendStringInfoChar(out, esc);
        }
        else
        {
            appendStringInfoChar(out, c);
        }
    }

    appendStringInfoChar(out, quote);
}

/*
 * Validate `string` as a strict JSON number, allowing surrounding
 * whitespace, and append the number (without that whitespace) to out.
 *
 * Returns false, appending nothing, if string is NULL or is not a
 * valid number.
 */
static bool
encode_number(StringInfo out, const char *string)
{
    const char *s = string;
    const char *start,
               *end;

    if (!string)
        return false;

    /* Validate number, trimming whitespace. */
    skip_whitespace(&s);
    start = s;
    if (!validate_number(&s))
        return false;
    end = s;
    skip_whitespace(&s);
    if (*s != '\0')
        return false;

    /* Append number to out */
    appendBinaryStringInfo(out, start, end - start);

    return true;
}

/* State shared by json_encode and json_encode_recurse. */
typedef struct
{
    StringInfoData str;         /* output accumulated so far */
    bool        use_orig;       /* reuse original text where still valid */
    bool        escape_unicode; /* write non-ASCII characters as \uXXXX */
    bool        trim;           /* drop whitespace around the next node
                                 * encoded (only ever the root) */
} json_encode_ctx;

static bool json_encode_recurse(json_node * node, json_encode_ctx * ctx);

/*
 * json_encode
 *    Convert a JSON node tree to JSON text.
 *
 * `options` is a bitmask of JSONOPT_USE_ORIG, JSONOPT_ESCAPE_UNICODE
 * and JSONOPT_NO_TRIM.  The root node's surrounding original
 * whitespace is omitted unless JSONOPT_NO_TRIM is given.
 *
 * Returns a newly allocated NUL-terminated string owned by the caller,
 * or NULL if the tree cannot be encoded (for example, a number node
 * whose text is not a valid JSON number).
 */
char *
json_encode(json_node * node, int options)
{
    json_encode_ctx ctx;

    initStringInfo(&ctx.str);
    ctx.use_orig = !!(options & JSONOPT_USE_ORIG);
    ctx.escape_unicode = !!(options & JSONOPT_ESCAPE_UNICODE);
    ctx.trim = !(options & JSONOPT_NO_TRIM);

    if (!json_encode_recurse(node, &ctx))
    {
        pfree(ctx.str.data);
        return NULL;
    }

    return ctx.str.data;
}

/*
 * Append the encoding of node (and its descendants) to ctx->str.
 *
 * When ctx->use_orig is set, any piece of original text (whitespace,
 * key, value) whose start pointer is still non-NULL is copied verbatim
 * instead of being regenerated.
 *
 * Returns false if the node, or any descendant, cannot be encoded.  The
 * buffer may then hold partial output, which the caller must discard.
 */
static bool
json_encode_recurse(json_node * node, json_encode_ctx * ctx)
{
#define has_orig(field) \
        (use_orig && node->orig.field.start)
#define push_orig(field) \
        appendBinaryStringInfo(&ctx->str, node->orig.field.start, \
                               node->orig.field.end - node->orig.field.start)

    bool        use_orig = ctx->use_orig;
    bool        trim = ctx->trim;

    ctx->trim = false;          /* Don't trim internal nodes, just the root
                                 * node. */

    if (!trim && has_orig(left_space))
        push_orig(left_space);

    if (has_orig(value))
    {
        push_orig(value);
    }
    else
    {
        const char *txt = NULL;
        json_node  *child;

        switch (node->type)
        {
            case JSON_NULL:
                txt = "null";
                break;
            case JSON_BOOL:
                if (node->v.v_bool)
                    txt = "true";
                else
                    txt = "false";
                break;
            case JSON_STRING:
                encode_string(&ctx->str,
                              node->v.string.str, node->v.string.length,
                              '"', ctx->escape_unicode);
                break;
            case JSON_NUMBER:
                if (!encode_number(&ctx->str, node->v.number))
                    return false;
                break;
            case JSON_ARRAY:
                appendStringInfoChar(&ctx->str, '[');

                json_foreach(child, node)
                {
                    /* Propagate failure rather than emit a hole. */
                    if (!json_encode_recurse(child, ctx))
                        return false;
                    if (child->next)
                        appendStringInfoChar(&ctx->str, ',');
                }

                appendStringInfoChar(&ctx->str, ']');
                break;
            case JSON_OBJECT:
                appendStringInfoChar(&ctx->str, '{');

                json_foreach(child, node)
                {
                    /*
                     * Shadows the parent node (assigned to the variable
                     * @node) so we can use our macros on the child node
                     * instead.  Hurray for lexical scoping!
                     */
                    json_node  *node = child;

                    if (has_orig(key_left_space))
                        push_orig(key_left_space);

                    if (has_orig(key))
                        push_orig(key);
                    else
                        encode_string(&ctx->str, node->key, node->key_length,
                                      '"', ctx->escape_unicode);

                    if (has_orig(key_right_space))
                        push_orig(key_right_space);

                    appendStringInfoChar(&ctx->str, ':');

                    /* Propagate failure rather than emit a hole. */
                    if (!json_encode_recurse(node, ctx))
                        return false;

                    if (node->next)
                        appendStringInfoChar(&ctx->str, ',');
                }

                appendStringInfoChar(&ctx->str, '}');
                break;
            default:
                return false;
        }

        if (txt)
            appendStringInfoString(&ctx->str, txt);
    }

    if (!trim && has_orig(right_space))
        push_orig(right_space);

    return true;

#undef has_orig
#undef push_orig
}

/*
 * json_encode_string
 *    If you're interested in encoding JSON in general, see json_encode .
 *
 *    Encodes a string literal JSON-style using the given quote character.
 *    Note that using anything but '"' as the quote character will result
 *    in invalid JSON.
 *
 *    @str must be valid UTF-8, though it may contain null characters
 *    (hence the length argument).
 *    @quote must not be a backslash.
 *
 *    Returns a newly allocated NUL-terminated string owned by the caller.
 */
char *
json_encode_string(const char *str, size_t length, char quote,
                   bool escape_unicode)
{
    StringInfoData ret;

    initStringInfo(&ret);
    encode_string(&ret, str, length, quote, escape_unicode);

    return ret.data;
}