/*------------------------------------------------------------------------- * * json.c * Core JSON manipulation routines used by JSON data type support. * * Copyright (c) 2011, PostgreSQL Global Development Group * Written by Joey Adams . * *------------------------------------------------------------------------- */ #include "json.h" #include "compat.h" #define is_space(c) ((c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == ' ') #define is_digit(c) ((c) >= '0' && (c) <= '9') #define is_hex_digit(c) (((c) >= '0' && (c) <= '9') || \ ((c) >= 'A' && (c) <= 'F') || \ ((c) >= 'a' && (c) <= 'f')) static unsigned int read_hex16(const char *in); static void write_hex16(char *out, unsigned int val); static pg_wchar from_surrogate_pair(unsigned int uc, unsigned int lc); static void to_surrogate_pair(pg_wchar unicode, unsigned int *uc, unsigned int *lc); static void appendStringInfoUtf8(StringInfo str, pg_wchar unicode); static void appendStringInfoEscape(StringInfo str, unsigned int c); static const char *stringify_value(StringInfo buf, const char *s, const char *e, const char *space, size_t space_length, int indent); static void append_indent(StringInfo buf, const char *space, size_t space_length, int indent); /* * Parsing functions and macros * * The functions and macros that follow are used to simplify the implementation * of the recursive descent parser used for JSON validation. See the * implementation of expect_object() for a good example of them in action. * * These functions/macros use a few unifying concepts: * * * const char *s and const char *e form a "slice" within a string, * where s points to the first character and e points after the last * character. Hence, the length of a slice is e - s. Although it would be * simpler to just get rid of const char *e and rely on strings being * null-terminated, varlena texts are not guaranteed to be null-terminated, * meaning we would have to copy them to parse them. * * * Every expect_* function sees if the beginning of a slice matches what it * expects, and returns the end of the slice on success. To illustrate: * * s expect_number(s, e) e * | | | * {"pi": 3.14159265, "e": 2.71828183, "phi": 1.61803399} * * * When a parse error occurs, s is set to NULL. Moreover, the parser * functions and macros check to see if s is NULL before using it. * This means parser functions built entirely of parser functions can proceed * with the illusion that the input will always be valid, rather than having * to do a NULL check on every line (see expect_number, which has no explicit * checks). However, one must ensure that the parser will always halt, * even in the NULL case. * * Bear in mind that while pop_*, optional_*, and skip_* update s, * the expect_* functions do not. Example: * * s = expect_char(s, e, '{'); * s = expect_space(s, e); * * if (optional_char(s, e, '}')) * return s; * * Also, note that functions traversing an already-validated JSON text * can take advantage of the assumption that the input is valid. * For example, stringify_value does not perform NULL checks, nor does it * check if s < e before dereferencing s. */ static const char *expect_value(const char *s, const char *e); static const char *expect_object(const char *s, const char *e); static const char *expect_array(const char *s, const char *e); static const char *expect_string(const char *s, const char *e); static const char *expect_number(const char *s, const char *e); static const char *expect_literal(const char *s, const char *e, const char *literal); static const char *expect_space(const char *s, const char *e); /* * All of these macros evaluate s multiple times. * * Macros ending in _pred take a function or macro of the form: * * bool pred(char c); * * Macros ending in _cond take an expression, where *s is the character in question. */ /* * expect_char: Expect the next character to be @c, and consume it. * expect_char_pred: Expect pred(next character) to hold, and consume it. * expect_char_cond: Expect a character to be available and cond to hold, and consume. * expect_eof: Expect there to be no more input left. * * These macros, like any expect_ macros/functions, return a new pointer * rather than updating @s. */ #define expect_char(s, e, c) expect_char_cond(s, e, *(s) == (c)) #define expect_char_pred(s, e, pred) expect_char_cond(s, e, pred(*(s))) #define expect_char_cond(s, e, cond) \ ((s) != NULL && (s) < (e) && (cond) ? (s) + 1 : NULL) #define expect_eof(s, e) ((s) != NULL && (s) == (e) ? (s) : NULL) /* * next_char: Get the next character, but do not consume it. * next_char_pred: Apply pred to the next character. * next_char_cond: Evaluate cond if a character is available. * * On EOF or error, next_char returns EOF, and * next_char_pred and next_char_cond return false. */ #define next_char(s, e) \ ((s) != NULL && (s) < (e) ? (int)(unsigned char) *(s) : (int) EOF) #define next_char_pred(s, e, pred) next_char_cond(s, e, pred(*(s))) #define next_char_cond(s, e, cond) ((s) != NULL && (s) < (e) ? (cond) : false) /* * pop_char: Consume the next character, and return it. * pop_char_pred: Consume the next character, and apply pred to it. * pop_char_cond is impossible to implement portably. * * On EOF or error, these macros do nothing, * pop_char returns EOF, and pop_char_cond returns false. */ #define pop_char(s, e) ((s) != NULL && (s) < (e) ? (int)(unsigned char) *(s)++ : (int) EOF) #define pop_char_pred(s, e) \ ((s) != NULL && (s) < (e) ? (s)++, pred((s)[-1]) : false) /* * optional_char: If the next character is @c, consume it. * optional_char_pred: If pred(next character) holds, consume it. * optional_char_cond: If a character is available, and cond holds, consume. * * These macros, when they consume, update @s and return true. * Otherwise, they do nothing and return false. */ #define optional_char(s, e, c) optional_char_cond(s, e, *(s) == (c)) #define optional_char_pred(s, e, pred) optional_char_cond(s, e, pred(*(s))) #define optional_char_cond(s, e, cond) \ ((s) != NULL && (s) < (e) && (cond) ? (s)++, true : false) /* * skip_pred: Skip zero or more characters matching pred. * skip1_pred: Skip one or more characters matching pred. * skip_cond: Skip zero or more characters where cond holds. * skip1_cond: Skip one or more characters where cond holds. */ #define skip_pred(s, e, pred) skip_cond(s, e, pred(*(s))) #define skip1_pred(s, e, pred) skip1_cond(s, e, pred(*(s))) #define skip_cond(s, e, cond) do { \ while (next_char_cond(s, e, cond)) \ (s)++; \ } while (0) #define skip1_cond(s, e, cond) do { \ if (next_char_cond(s, e, cond)) \ { \ (s)++; \ while (next_char_cond(s, e, cond)) \ (s)++; \ } \ else \ { \ (s) = NULL; \ } \ } while (0) /* * json_validate - Test if text is valid JSON. * * Note: scalar values (strings, numbers, booleans, and nulls) * are considered valid by this function, and by the JSON datatype. */ bool json_validate(const char *str, size_t length) { const char *s = str; const char *e = str + length; s = expect_space(s, e); s = expect_value(s, e); s = expect_space(s, e); s = expect_eof(s, e); return s != NULL; } /* * json_validate_nospace - Test if text is valid JSON and has no whitespace around tokens. * * JSON data is condensed on input, meaning spaces around tokens are removed. * The fact that these spaces are gone is exploited in functions that * traverse and manipulate JSON. */ bool json_validate_nospace(const char *str, size_t length) { const char *s = str; const char *e = str + length; if (!json_validate(str, length)) return false; while (s < e) { if (*s == '"') { s = expect_string(s, e); if (s == NULL) /* should never happen */ return false; } else if (is_space(*s)) { return false; } s++; } return true; } /* * json_condense - Make JSON content shorter by removing spaces * and unescaping characters. */ char * json_condense(const char *json_str, size_t length, size_t *out_length) { const char *s = json_str; const char *e = s + length; StringInfoData buf; bool inside_string = false; bool server_encoding_is_utf8 = GetDatabaseEncoding() == PG_UTF8; Assert(json_validate(json_str, length)); initStringInfo(&buf); while (s < e) { /* * To make sense of this procedural mess, think of it as a flow chart * that branches based on the characters that follow. * * When the algorithm wants to unescape a character, * it will append the unescaped character, advance s, * then continue. Otherwise, it will perform the default * behavior and emit the character as is. */ if (inside_string) { /* When we are inside a string literal, convert escapes * to the characters they represent when possible. */ if (*s == '\\' && s+1 < e) { /* Change \/ to / */ if (s[1] == '/') { appendStringInfoChar(&buf, '/'); s += 2; continue; } /* Emit single-character escape as is now * to avoid getting mixed up by \\ and \" */ if (s[1] != 'u') { appendStringInfoChar(&buf, s[0]); appendStringInfoChar(&buf, s[1]); s += 2; continue; } /* Unescape \uXXXX if it is possible and feasible. */ if (s+5 < e && s[1] == 'u') { unsigned int uc = read_hex16(s+2); /* If a \uXXXX escape stands for a non-control ASCII * character, unescape it. */ if (uc >= 0x20 && uc <= 0x7E) { s += 6; appendStringInfoChar(&buf, uc); continue; } /* Do not unescape 0x7F (DEL) because, although * the JSON RFC does not mention it, it is in fact * a control character. */ /* Unescape Unicode characters only if * the server encoding is UTF-8. */ if (uc > 0x7F && server_encoding_is_utf8) { if (uc >= 0xD800 && uc <= 0xDFFF) { /* Unescape a UTF-16 surrogate pair, * but only if it's present and valid. */ if (s+11 < e && s[6] == '\\' && s[7] == 'u') { unsigned int lc = read_hex16(s+8); if (uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF) { s += 12; appendStringInfoUtf8(&buf, from_surrogate_pair(uc, lc)); continue; } } } else { s += 6; appendStringInfoUtf8(&buf, uc); continue; } } } } } else { /* When we are not in a string literal, remove spaces. */ if (is_space(*s)) { do s++; while (s < e && is_space(*s)); continue; } } /* If we get here, it means we want to emit this character as is. */ appendStringInfoChar(&buf, *s); if (*s++ == '"') inside_string = !inside_string; } if (out_length != NULL) *out_length = buf.len; return buf.data; } /* * json_need_to_escape_unicode * Determine whether we need to convert non-ASCII characters * to \uXXXX escapes to prevent transcoding errors. * * If any of the following hold, no escaping needs to be done: * * * The client encoding is UTF-8. Escaping is not necessary because * the client can encode all Unicode codepoints. * * * The client encoding and the server encoding are the same. * Escaping is not necessary because the client can encode all * codepoints the server can encode. * * * The server encoding is SQL_ASCII. This encoding tells PostgreSQL * to shirk transcoding in favor of speed. It wasn't unescaped on input, * so don't worry about escaping on output. * * * The client encoding is SQL_ASCII. This encoding tells PostgreSQL * to not perform encoding conversion. * * Otherwise, (no matter how expensive it is) all non-ASCII characters are escaped. */ bool json_need_to_escape_unicode(void) { int server_encoding = GetDatabaseEncoding(); int client_encoding = pg_get_client_encoding(); if (client_encoding == PG_UTF8 || client_encoding == server_encoding || server_encoding == PG_SQL_ASCII || client_encoding == PG_SQL_ASCII) return false; return true; } /* * json_escape_unicode - Convert non-ASCII characters to \uXXXX escapes. */ char * json_escape_unicode(const char *json, size_t length, size_t *out_length) { const char *s; const char *e; StringInfoData buf; /* Convert to UTF-8, if necessary. */ { const char *orig = json; json = (const char *) pg_do_encoding_conversion((unsigned char *) json, length, GetDatabaseEncoding(), PG_UTF8); if (json != orig) length = strlen(json); } Assert(json_validate(json, length)); s = json; e = json + length; initStringInfo(&buf); while (s < e) { if ((unsigned char) *s > 0x7F) { int len; pg_wchar u; len = pg_utf_mblen((const unsigned char *) s); if (s + len > e) { Assert(false); appendStringInfoChar(&buf, *s); s++; continue; } u = utf8_to_unicode((const unsigned char *) s); s += len; if (u <= 0xFFFF) { appendStringInfoEscape(&buf, u); } else { unsigned int uc, lc; to_surrogate_pair(u, &uc, &lc); appendStringInfoEscape(&buf, uc); appendStringInfoEscape(&buf, lc); } } else { appendStringInfoChar(&buf, *s); s++; } } if (out_length != NULL) *out_length = buf.len; return buf.data; } /* * json_stringify - Format JSON into text with indentation. * * Input must be valid, condensed JSON. */ char * json_stringify(const char *json, size_t length, const char *space, size_t space_length, size_t *out_length) { const char *s = json; const char *e = json + length; StringInfoData buf; if (!json_validate_nospace(json, length)) report_corrupt_json(); initStringInfo(&buf); s = stringify_value(&buf, s, e, space, space_length, 0); Assert(s == e); if (out_length != NULL) *out_length = buf.len; return buf.data; } static const char * stringify_value(StringInfo buf, const char *s, const char *e, const char *space, size_t space_length, int indent) { const char *s2; Assert(s < e); switch (*s) { case '[': appendStringInfoString(buf, "[\n"); s++; if (*s != ']') { for (;;) { append_indent(buf, space, space_length, indent + 1); s = stringify_value(buf, s, e, space, space_length, indent + 1); Assert(s < e && (*s == ',' || *s == ']')); if (*s == ']') break; appendStringInfoString(buf, ",\n"); s++; } appendStringInfoChar(buf, '\n'); } append_indent(buf, space, space_length, indent); appendStringInfoChar(buf, ']'); return s + 1; case '{': appendStringInfoString(buf, "{\n"); s++; if (*s != '}') { for (;;) { append_indent(buf, space, space_length, indent + 1); s2 = expect_string(s, e); appendBinaryStringInfo(buf, s, s2 - s); s = s2; Assert(s < e && *s == ':'); appendStringInfoString(buf, ": "); s++; s = stringify_value(buf, s, e, space, space_length, indent + 1); Assert(s < e && (*s == ',' || *s == '}')); if (*s == '}') break; appendStringInfoString(buf, ",\n"); s++; } appendStringInfoChar(buf, '\n'); } append_indent(buf, space, space_length, indent); appendStringInfoChar(buf, '}'); return s + 1; default: s2 = expect_value(s, e); appendBinaryStringInfo(buf, s, s2 - s); return s2; } } static void append_indent(StringInfo buf, const char *space, size_t space_length, int indent) { int i; for (i = 0; i < indent; i++) appendBinaryStringInfo(buf, space, space_length); } /* * json_get_type - Determine the type of JSON content * given the first non-space character. * * Return JSON_INVALID if the first character is not recognized. */ JsonType json_get_type(int c) { switch (c) { case 'n': return JSON_NULL; case '"': return JSON_STRING; case '-': return JSON_NUMBER; case 'f': case 't': return JSON_BOOL; case '{': return JSON_OBJECT; case '[': return JSON_ARRAY; default: if (is_digit(c)) return JSON_NUMBER; return JSON_INVALID; } } /* * Reads exactly 4 hex characters (capital or lowercase). * Expects in[0..3] to be in bounds, and expects them to be hexadecimal characters. */ static unsigned int read_hex16(const char *in) { unsigned int i; unsigned int tmp; unsigned int ret = 0; for (i = 0; i < 4; i++) { char c = *in++; Assert(is_hex_digit(c)); if (c >= '0' && c <= '9') tmp = c - '0'; else if (c >= 'A' && c <= 'F') tmp = c - 'A' + 10; else /* if (c >= 'a' && c <= 'f') */ tmp = c - 'a' + 10; ret <<= 4; ret += tmp; } return ret; } /* * Encodes a 16-bit number in hexadecimal, writing exactly 4 hex characters. */ static void write_hex16(char *out, unsigned int val) { const char *hex = "0123456789ABCDEF"; *out++ = hex[(val >> 12) & 0xF]; *out++ = hex[(val >> 8) & 0xF]; *out++ = hex[(val >> 4) & 0xF]; *out++ = hex[val & 0xF]; } /* Compute the Unicode codepoint of a UTF-16 surrogate pair. */ static pg_wchar from_surrogate_pair(unsigned int uc, unsigned int lc) { Assert(uc >= 0xD800 && uc <= 0xDBFF && lc >= 0xDC00 && lc <= 0xDFFF); return 0x10000 + ((((pg_wchar)uc & 0x3FF) << 10) | (lc & 0x3FF)); } /* Construct a UTF-16 surrogate pair given a Unicode codepoint. */ static void to_surrogate_pair(pg_wchar unicode, unsigned int *uc, unsigned int *lc) { pg_wchar n = unicode - 0x10000; *uc = ((n >> 10) & 0x3FF) | 0xD800; *lc = (n & 0x3FF) | 0xDC00; } /* Append a Unicode character by converting it to UTF-8. */ static void appendStringInfoUtf8(StringInfo str, pg_wchar unicode) { if (str->len + 4 >= str->maxlen) enlargeStringInfo(str, 4); unicode_to_utf8(unicode, (unsigned char *) &str->data[str->len]); str->len += pg_utf_mblen((const unsigned char *) &str->data[str->len]); str->data[str->len] = '\0'; } static void appendStringInfoEscape(StringInfo str, unsigned int c) { if (str->len + 6 >= str->maxlen) enlargeStringInfo(str, 6); str->data[str->len++] = '\\'; str->data[str->len++] = 'u'; write_hex16(str->data + str->len, c); str->len += 4; str->data[str->len] = '\0'; } static const char * expect_value(const char *s, const char *e) { int c = next_char(s, e); switch (c) { case '{': return expect_object(s, e); case '[': return expect_array(s, e); case '"': return expect_string(s, e); case '-': return expect_number(s, e); case 'n': return expect_literal(s, e, "null"); case 'f': return expect_literal(s, e, "false"); case 't': return expect_literal(s, e, "true"); default: if (is_digit(c)) return expect_number(s, e); return NULL; } } static const char * expect_object(const char *s, const char *e) { s = expect_char(s, e, '{'); s = expect_space(s, e); if (optional_char(s, e, '}')) return s; while (s != NULL) { s = expect_string(s, e); s = expect_space(s, e); s = expect_char(s, e, ':'); s = expect_space(s, e); s = expect_value(s, e); s = expect_space(s, e); if (optional_char(s, e, '}')) return s; s = expect_char(s, e, ','); s = expect_space(s, e); } return NULL; } static const char * expect_array(const char *s, const char *e) { s = expect_char(s, e, '['); s = expect_space(s, e); if (optional_char(s, e, ']')) return s; while (s != NULL) { s = expect_value(s, e); s = expect_space(s, e); if (optional_char(s, e, ']')) return s; s = expect_char(s, e, ','); s = expect_space(s, e); } return NULL; } static const char * expect_string(const char *s, const char *e) { s = expect_char(s, e, '"'); for (;;) { int c = pop_char(s, e); if (c <= 0x1F) /* Control character, EOF, or error */ return NULL; if (c == '"') return s; if (c == '\\') { switch (pop_char(s, e)) { case '"': case '\\': case '/': case 'b': case 'f': case 'n': case 'r': case 't': break; case 'u': { int i; for (i = 0; i < 4; i++) { c = pop_char(s, e); if (!is_hex_digit(c)) return NULL; } } break; default: return NULL; } } } } static const char * expect_number(const char *s, const char *e) { optional_char(s, e, '-'); if (!optional_char(s, e, '0')) skip1_pred(s, e, is_digit); if (optional_char(s, e, '.')) skip1_pred(s, e, is_digit); if (optional_char_cond(s, e, *s == 'E' || *s == 'e')) { optional_char_cond(s, e, *s == '+' || *s == '-'); skip1_pred(s, e, is_digit); } return s; } static const char * expect_literal(const char *s, const char *e, const char *literal) { if (s == NULL) return NULL; while (*literal != '\0') if (s >= e || *s++ != *literal++) return NULL; return s; } /* Accepts *zero* or more spaces. */ static const char * expect_space(const char *s, const char *e) { if (s == NULL) return NULL; for (; s < e && is_space(*s); s++) {} return s; }