summaryrefslogtreecommitdiff
path: root/json.c
diff options
context:
space:
mode:
author Joey Adams 2010-06-09 04:19:29 +0000
committer Joey Adams 2010-06-09 04:19:29 +0000
commit bb5bbf028efbf663c15628e748a2a878e766ef65 (patch)
tree 9671d5174cb383877d7d63ef57b2163537b74b9b /json.c
parent 27d164bc9af93c9fe87649367248b7b794d1aacd (diff)
Moved everything in /contrib/json along with json.sgml to root directory.
This repository should probably be an honest-to-goodness branch of mainline PostgreSQL. I'm looking into that :-)
Diffstat (limited to 'json.c')
-rw-r--r--json.c1210
1 files changed, 1210 insertions, 0 deletions
diff --git a/json.c b/json.c
new file mode 100644
index 0000000..d530edd
--- /dev/null
+++ b/json.c
@@ -0,0 +1,1210 @@
+/*
+ Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+ All rights reserved.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include "json.h"
+
+#include <ctype.h>
+
+
+
+/* When true, encode_string() writes every non-ASCII character as a \uXXXX
+ * escape instead of copying its UTF-8 bytes through unchanged. */
+bool json_escape_unicode = false;
+
+/* Allocator used throughout this file; it maps directly onto palloc. */
+#define JSON_malloc palloc
+
+/*
+ * realloc-style wrapper. repalloc can't take a null pointer (unlike normal
+ * realloc), so a NULL ptr is routed to palloc instead.
+ */
+static void *JSON_realloc(void *ptr, Size size)
+{
+    return ptr ? repalloc(ptr, size) : palloc(size);
+}
+/* free-style wrapper: a NULL ptr is ignored rather than handed to pfree. */
+static void JSON_free(void *ptr)
+{
+    if (ptr == NULL)
+        return;
+    pfree(ptr);
+}
+
+/*
+ * Copy the first @length bytes of @str into a newly allocated buffer of
+ * length + 1 bytes and terminate it with '\0'. The bytes are copied
+ * verbatim, so embedded zero bytes are kept.
+ */
+static char *JSON_strdup(const char *str, size_t length)
+{
+    char *copy = JSON_malloc(length + 1);
+
+    copy[length] = '\0';
+    memcpy(copy, str, length);
+    return copy;
+}
+
+/* True when @node is a container, i.e. its type is JSON_ARRAY or JSON_OBJECT.
+ * Note that @node is evaluated twice. */
+#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)
+
+/* We can't use isspace() because it also accepts \v and \f, which
+ aren't legal whitespace characters in strict JSON. */
+/* Note that @c is evaluated up to four times. */
+#define is_whitespace(c) ((c)==' ' || (c)=='\t' || (c)=='\n' || (c)=='\r')
+
+/* Advance *sp past any run of JSON whitespace (space, tab, LF, CR). */
+static void skip_whitespace(const char **sp)
+{
+    const char *cursor;
+
+    for (cursor = *sp; is_whitespace(*cursor); cursor++)
+        ;
+    *sp = cursor;
+}
+
+/*
+ * Return the closing delimiter that matches @node's type: ']' for an array,
+ * '}' for an object, and 0 for a NULL node or any other type.
+ */
+static char end_parenthesis(json_node *node)
+{
+    if (node != NULL) {
+        if (node->type == JSON_ARRAY)
+            return ']';
+        if (node->type == JSON_OBJECT)
+            return '}';
+    }
+    return 0;
+}
+
+/*
+ * Parse exactly four hexadecimal digits (upper or lower case) starting at @in.
+ *
+ * On success *out receives the 16-bit value and true is returned.
+ * On failure false is returned and *out holds the value of the digits that
+ * were read before the offending character (0 if the first one was bad).
+ */
+static bool read_hex16(const char *in, unsigned int *out)
+{
+    unsigned int acc = 0;
+    int remaining;
+
+    for (remaining = 4; remaining > 0; remaining--) {
+        char ch = *in++;
+        unsigned int digit;
+
+        if (ch >= '0' && ch <= '9') {
+            digit = ch - '0';
+        } else if (ch >= 'a' && ch <= 'f') {
+            digit = ch - 'a' + 10;
+        } else if (ch >= 'A' && ch <= 'F') {
+            digit = ch - 'A' + 10;
+        } else {
+            *out = acc;
+            return false;
+        }
+
+        acc = (acc << 4) + digit;
+    }
+
+    *out = acc;
+    return true;
+}
+
+/*
+ * Write the low 16 bits of @val to out[0..3] as four uppercase hexadecimal
+ * digits, most significant nibble first. No terminator is written.
+ */
+static void write_hex16(char *out, unsigned int val)
+{
+    static const char digits[] = "0123456789ABCDEF";
+    int shift;
+
+    for (shift = 12; shift >= 0; shift -= 4)
+        *out++ = digits[(val >> shift) & 0xF];
+}
+
+/* Forward declarations of the UTF-8 helpers; their definitions are in the
+ * "Unicode" section further down this file. */
+static bool utf8_validate(const char *str, size_t length);
+static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
+static int utf8_encode_char(char *out, unsigned int uc);
+
+
+/*************************** String buffer ***************************/
+
+/*
+ * Growable character buffer.
+ *
+ * The type is a one-element array of the struct, so a String variable is
+ * passed to the string_* functions as a pointer and fields are reached
+ * with '->'.
+ */
+typedef struct {
+ char *buffer; /* allocated storage, or NULL before the first string_grow() */
+ size_t length; /* characters stored; string_grow() keeps buffer[length] == '\0' */
+ size_t alloc; /* size in bytes requested for buffer */
+} String[1];
+
+/* Declare and initialize a String with the given name. */
+#define String(name) String name = {{NULL, 0, 0}}
+
+/*
+ * Extend the string by @need characters, enlarging the allocation when the
+ * new length (plus terminator) no longer fits. The new capacity is
+ * 1.5 * length + 1, with a floor of 8 bytes.
+ *
+ * Returns a pointer to the start of the newly added, still uninitialized
+ * range. The '\0' terminator after that range is written here.
+ */
+static char *string_grow(String str, size_t need)
+{
+    const size_t old_length = str->length;
+    const size_t new_length = old_length + need;
+
+    str->length = new_length;
+
+    if (new_length >= str->alloc) {
+        size_t capacity = new_length*3/2 + 1;
+
+        if (capacity < 8)
+            capacity = 8;
+        str->alloc = capacity;
+        str->buffer = JSON_realloc(str->buffer, capacity);
+    }
+
+    str->buffer[new_length] = '\0';
+    return &str->buffer[old_length];
+}
+/*
+ * Return the string's buffer. If nothing has been allocated yet, an empty
+ * terminated buffer is allocated first, so the result is never NULL.
+ */
+static char *string_buffer(String str)
+{
+    if (str->buffer == NULL)
+        (void) string_grow(str, 0);
+    return str->buffer;
+}
+/* Append exactly @len bytes starting at @append to the end of the string. */
+static inline void string_append_length(String str, const char *append, size_t len)
+{
+    memcpy(string_grow(str, len), append, len);
+}
+/* Append the '\0'-terminated text @append to the end of the string. */
+static inline void string_append(String str, const char *append)
+{
+    size_t len = strlen(append);
+
+    string_append_length(str, append, len);
+}
+/* Append the single character @c to the end of the string. */
+static inline void string_append_char(String str, char c)
+{
+    char *slot = string_grow(str, 1);
+
+    slot[0] = c;
+}
+/*
+ * Set the string's length to @len and write the terminator there.
+ * The buffer must already be allocated; the allocation is not resized.
+ */
+static inline void string_trunc(String str, size_t len)
+{
+    str->buffer[len] = '\0';
+    str->length = len;
+}
+/*
+ * Release the string's buffer (a NULL buffer is fine). The String's fields
+ * are left as they were, so it must not be used again afterwards.
+ */
+static inline void string_free(String str)
+{
+    char *buf = str->buffer;
+
+    JSON_free(buf);
+}
+
+
+/*********** json_node creation, manipulation, and deletion **********/
+
+/*
+ * Allocate a node of the given type. Every other field of the node is
+ * zero-filled.
+ */
+json_node *json_mknode(json_type type)
+{
+    json_node *fresh = JSON_malloc(sizeof(json_node));
+
+    memset(fresh, 0, sizeof(json_node));
+    fresh->type = type;
+    return fresh;
+}
+
+/* Allocate a JSON_BOOL node holding @v_bool. */
+json_node *json_mkbool(bool v_bool)
+{
+    json_node *result = json_mknode(JSON_BOOL);
+
+    result->v.v_bool = v_bool;
+    return result;
+}
+
+/*
+ * Allocate a JSON_STRING node holding a private copy of the first @length
+ * bytes of @str. If @str is NULL the node's string stays NULL with length 0.
+ */
+json_node *json_mkstring(const char *str, size_t length)
+{
+    json_node *result = json_mknode(JSON_STRING);
+
+    if (str == NULL)
+        return result;
+
+    result->v.string.str = JSON_strdup(str, length);
+    result->v.string.length = length;
+    return result;
+}
+
+/*
+ * Allocate a JSON_NUMBER node holding a private, '\0'-terminated copy of the
+ * first @length bytes of @number. The text is not validated here. If
+ * @number is NULL the node's number stays NULL.
+ */
+json_node *json_mknumber(const char *number, size_t length)
+{
+    json_node *result = json_mknode(JSON_NUMBER);
+
+    if (number == NULL)
+        return result;
+
+    result->v.number = JSON_strdup(number, length);
+    return result;
+}
+
+/*
+ * Link @child in as the last child of @parent.
+ * @parent must be an array or object and @child must not already have a
+ * parent (both are checked with Assert).
+ */
+void json_append(json_node *parent, json_node *child)
+{
+    json_node *old_tail;
+
+    Assert(parent->type==JSON_ARRAY || parent->type==JSON_OBJECT);
+    Assert(child->parent == NULL);
+
+    old_tail = parent->v.children.tail;
+
+    child->parent = parent;
+    child->prev = old_tail;
+    child->next = NULL;
+
+    /* An empty list needs its head set; otherwise chain after the old tail. */
+    if (old_tail != NULL)
+        old_tail->next = child;
+    else
+        parent->v.children.head = child;
+
+    parent->v.children.tail = child;
+    parent->v.children.count++;
+}
+
+/*
+ * Unlink @node from its parent's child list. The node itself (and anything
+ * beneath it) is not freed. Does nothing if @node has no parent.
+ */
+void json_remove(json_node *node)
+{
+    json_node *parent = node->parent;
+    json_node *before, *after;
+
+    if (parent == NULL)
+        return;
+    Assert(parent->type==JSON_ARRAY || parent->type==JSON_OBJECT);
+    Assert(parent->v.children.count > 0);
+
+    before = node->prev;
+    after = node->next;
+
+    /* Patch the forward link (or the list head if node was first). */
+    if (before != NULL)
+        before->next = after;
+    else
+        parent->v.children.head = after;
+
+    /* Patch the backward link (or the list tail if node was last). */
+    if (after != NULL)
+        after->prev = before;
+    else
+        parent->v.children.tail = before;
+
+    node->next = NULL;
+    node->prev = NULL;
+    node->parent = NULL;
+    parent->v.children.count--;
+}
+
+/*
+ * Return the text stored in a JSON_STRING node (may be NULL). The pointer
+ * refers to the node's own storage. If @length_out is non-NULL it receives
+ * the stored length.
+ */
+const char *json_get_string(json_node *node, size_t *length_out)
+{
+    Assert(node->type == JSON_STRING);
+
+    if (length_out != NULL)
+        length_out[0] = node->v.string.length;
+
+    return node->v.string.str;
+}
+
+/*
+ * Replace the text of a JSON_STRING node. The old text is freed first, then
+ * a private copy of the first @length bytes of @str is stored. Passing a
+ * NULL @str leaves the node with a NULL string of length 0.
+ */
+void json_set_string(json_node *node, const char *str, size_t length)
+{
+    Assert(node->type == JSON_STRING);
+
+    /* JSON_free ignores NULL, so no separate check is needed. */
+    JSON_free(node->v.string.str);
+
+    if (str == NULL) {
+        node->v.string.str = NULL;
+        node->v.string.length = 0;
+        return;
+    }
+
+    node->v.string.str = JSON_strdup(str, length);
+    node->v.string.length = length;
+}
+
+/*
+ * Return the textual number stored in a JSON_NUMBER node (may be NULL).
+ * The pointer refers to the node's own storage.
+ */
+const char *json_get_number(json_node *node)
+{
+    const char *text;
+
+    Assert(node->type == JSON_NUMBER);
+    text = node->v.number;
+    return text;
+}
+
+/*
+ * Replace the text of a JSON_NUMBER node. The old text is freed first, then
+ * a private copy of the first @length bytes of @number is stored (NULL if
+ * @number is NULL). The text is not validated here.
+ */
+void json_set_number(json_node *node, const char *number, size_t length)
+{
+    Assert(node->type == JSON_NUMBER);
+
+    /* JSON_free ignores NULL, so no separate check is needed. */
+    JSON_free(node->v.number);
+
+    node->v.number = (number != NULL) ? JSON_strdup(number, length) : NULL;
+}
+
+/*
+ * Free a single node: its string or number payload, its key, and the node
+ * structure itself. Children are not touched.
+ */
+static void free_node(json_node *node)
+{
+    switch (node->type) {
+    case JSON_STRING:
+        JSON_free(node->v.string.str);
+        break;
+    case JSON_NUMBER:
+        JSON_free(node->v.number);
+        break;
+    default:
+        break;
+    }
+
+    /* JSON_free ignores NULL, so an absent key is fine. */
+    JSON_free(node->key);
+    JSON_free(node);
+}
+
+/*
+ * Free @node and everything beneath it. A NULL @node is ignored.
+ *
+ * The node is first unlinked from its parent, so its parent and next
+ * pointers are NULL and the walk below cannot leave the subtree. The walk
+ * is iterative (no recursion): it always frees a childless node or a node
+ * whose children have all been freed already.
+ */
+void json_delete(json_node *node)
+{
+ json_node *parent, *next;
+
+ if (!node)
+ return;
+
+ /* Remove node from parent (if it has one). */
+ json_remove(node);
+
+ goto descend;
+
+descend:
+ /* Follow first-child links down to a node that has no children. */
+ while (is_internal(node) && node->v.children.head)
+ node = node->v.children.head;
+ goto advance;
+
+advance:
+ /* Save the links we still need, then free the node. */
+ parent = node->parent;
+ next = node->next;
+ free_node(node);
+ node = next;
+
+ if (node)
+ goto descend;
+ else
+ goto ascend;
+
+ascend:
+ /* All siblings are gone; the parent can now be freed in turn.
+ * We jump to advance (not descend) because its children no longer exist. */
+ node = parent;
+ if (node)
+ goto advance;
+ else
+ return;
+}
+
+
+/*********************** Parsing and validation **********************/
+
+/* Forward declarations of the leaf-level decoders defined after json_decode(). */
+static json_node *decode_leaf(const char **sp);
+static json_node *decode_number(const char **sp);
+static char *decode_string(const char **sp, size_t *length);
+/* decode_string has a different signature than its friends
+ because it's also used to parse object member keys. */
+
+/*
+ * Return true if @str is accepted by json_decode(), false otherwise.
+ * The tree built while checking is freed again before returning.
+ */
+bool json_validate(const char *str)
+{
+    json_node *tree = json_decode(str);
+
+    if (tree != NULL) {
+        json_delete(tree);
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Parse the '\0'-terminated JSON text @str into a tree of json_node.
+ *
+ * Returns the root node, or NULL if @str is NULL, is not valid UTF-8, or
+ * does not match the grammar. On failure everything built so far is freed.
+ * The top-level value may be of any type, not just an array or object.
+ *
+ * The parser is an iterative state machine whose states are the goto labels
+ * below; nesting is tracked through the nodes' parent pointers, so no
+ * recursion is used.
+ */
+json_node *json_decode(const char *str)
+{
+ json_node *root = NULL, *parent = NULL, *node = NULL;
+ const char *s = str;
+ char *key = NULL;
+ size_t key_length = 0;
+
+ if (!str)
+ return NULL;
+
+ if (!utf8_validate(str, strlen(str)))
+ return NULL;
+
+ goto item;
+
+item: /* Expect a value */
+ skip_whitespace(&s);
+
+ if (parent && parent->type == JSON_OBJECT) {
+ /* Parse member key string. */
+ key = decode_string(&s, &key_length);
+ if (!key)
+ goto failed;
+
+ /* Eat the " : " */
+ skip_whitespace(&s);
+ if (*s != ':')
+ goto failed;
+ s++;
+ skip_whitespace(&s);
+ } else {
+ key = NULL;
+ }
+
+ /* decode_leaf returns NULL both for containers and for garbage;
+ * the bracket check below tells the two apart. */
+ node = decode_leaf(&s);
+ if (!node) {
+ if (*s == '[')
+ node = json_mknode(JSON_ARRAY);
+ else if (*s == '{')
+ node = json_mknode(JSON_OBJECT);
+ else
+ goto failed;
+ s++;
+ }
+
+ /* Hand ownership of the key to the node, so the failure path
+ * does not free it a second time. */
+ if (key) {
+ node->key = key;
+ node->key_length = key_length;
+ key = NULL;
+ }
+
+ if (parent)
+ json_append(parent, node);
+ else
+ root = node;
+
+ if (is_internal(node)) {
+ parent = node;
+ goto item_endp;
+ }
+
+ if (parent)
+ goto comma_endp;
+ else
+ goto end;
+
+comma_endp: /* Expect a comma or end bracket/brace */
+ skip_whitespace(&s);
+
+ if (*s == ',') {
+ s++;
+ goto item;
+ }
+ if (*s == ']' || *s == '}')
+ goto endp;
+
+ goto failed;
+
+item_endp: /* Expect a value or end bracket/brace */
+ skip_whitespace(&s);
+ if (*s == ']' || *s == '}')
+ goto endp;
+ goto item;
+
+endp: /* Handle an end bracket/brace */
+ /* The closer must match the container being closed (']' vs '}'). */
+ if (*s != end_parenthesis(parent))
+ goto failed;
+ s++;
+ node = parent;
+ parent = parent->parent;
+ if (parent)
+ goto comma_endp;
+ else
+ goto end;
+
+end: /* Expect end of text */
+ skip_whitespace(&s);
+ if (*s)
+ goto failed;
+ return node;
+
+failed: /* Handle failure */
+ if (key)
+ JSON_free(key);
+ json_delete(root);
+ return NULL;
+}
+
+/*
+ * Decode one value that has no children (string, number, true, false or
+ * null) at *sp and advance *sp past it. Leading whitespace is the caller's
+ * job.
+ *
+ * Returns the new node, or NULL when the text starts with '[' or '{' or is
+ * not a valid leaf.
+ */
+static json_node *decode_leaf(const char **sp)
+{
+    const char *text = *sp;
+    const char first = *text;
+    size_t length;
+    char *str;
+    json_node *node;
+
+    if (first == '-' || (first >= '0' && first <= '9'))
+        return decode_number(sp);
+
+    if (first != '"') {
+        /* Only the three literals remain as possibilities. */
+        if (strncmp(text, "true", 4) == 0) {
+            *sp = text + 4;
+            return json_mkbool(true);
+        }
+        if (strncmp(text, "false", 5) == 0) {
+            *sp = text + 5;
+            return json_mkbool(false);
+        }
+        if (strncmp(text, "null", 4) == 0) {
+            *sp = text + 4;
+            return json_mknode(JSON_NULL);
+        }
+        return NULL;
+    }
+
+    str = decode_string(sp, &length);
+    if (str == NULL)
+        return NULL;
+
+    /* The node takes over the buffer returned by decode_string. */
+    node = json_mknode(JSON_STRING);
+    node->v.string.str = str;
+    node->v.string.length = length;
+    return node;
+}
+
+/*
+ * The JSON spec says that a number shall follow this precise pattern
+ * (spaces and quotes added for readability):
+ * '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)?
+ *
+ * However, some JSON parsers are more liberal. For instance, PHP accepts
+ * '.5' and '1.'. JSON.parse accepts '+3'.
+ *
+ * This function takes the strict approach. The user should use
+ * json_cleanup() to handle liberal JSON text.
+ *
+ * On success *sp is advanced to the first character after the number and
+ * true is returned; whatever follows is not examined. On failure *sp is
+ * left unchanged and false is returned.
+ *
+ * Every character is cast to unsigned char before being handed to
+ * isdigit(): passing a negative char (any byte >= 0x80 where char is
+ * signed) is undefined behaviour.
+ */
+static bool validate_number(const char **sp)
+{
+    const char *s = *sp;
+
+    /* '-'? */
+    if (*s == '-')
+        s++;
+
+    /* (0 | [1-9][0-9]*) */
+    if (*s == '0') {
+        s++;
+    } else {
+        if (!isdigit((unsigned char) *s))
+            return false;
+        do s++; while (isdigit((unsigned char) *s));
+    }
+
+    /* ('.' [0-9]+)? */
+    if (*s == '.') {
+        s++;
+        if (!isdigit((unsigned char) *s))
+            return false;
+        do s++; while (isdigit((unsigned char) *s));
+    }
+
+    /* ([Ee] [+-]? [0-9]+)? */
+    if (*s=='E' || *s=='e') {
+        s++;
+        if (*s=='+' || *s=='-')
+            s++;
+        if (!isdigit((unsigned char) *s))
+            return false;
+        do s++; while (isdigit((unsigned char) *s));
+    }
+
+    *sp = s;
+    return true;
+}
+
+/*
+ * Decode a strictly formatted number at *sp into a JSON_NUMBER node that
+ * holds a copy of its text, and advance *sp past it. Returns NULL (leaving
+ * *sp alone) if the text is not a valid number.
+ */
+static json_node *decode_number(const char **sp)
+{
+    const char *begin = *sp;
+
+    if (validate_number(sp))
+        return json_mknumber(begin, *sp - begin);
+
+    return NULL;
+}
+
+/*
+ * Decode the JSON string literal at *sp, which must start with '"'.
+ *
+ * Backslash escapes are resolved; \uXXXX escapes (including UTF-16
+ * surrogate pairs) are converted to UTF-8. Unescaped control characters
+ * (<= 0x1F), unknown escapes, lone or mis-ordered surrogates, U+FFFE,
+ * U+FFFF and a missing closing quote are all errors.
+ *
+ * On success returns a newly allocated, '\0'-terminated buffer, stores its
+ * length in *length (the text may contain embedded zero bytes from \u0000)
+ * and advances *sp past the closing quote. On failure returns NULL and
+ * leaves *sp and *length untouched.
+ */
+static char *decode_string(const char **sp, size_t *length)
+{
+    const char *s = *sp;
+    String(ret);
+
+    if (*s++ != '"')
+        return NULL;
+
+    while (*s && *s != '"') {
+        unsigned char c = *s++;
+
+        if (c == '\\') {
+            c = *s++;
+            switch (c) {
+            case '"':
+            case '\\':
+            case '/':
+                break;
+            case 'b': c = '\b'; break;
+            case 'f': c = '\f'; break;
+            case 'n': c = '\n'; break;
+            case 'r': c = '\r'; break;
+            case 't': c = '\t'; break;
+            case 'u': {
+                unsigned int uc, lc;
+                size_t size;
+                char *out;
+
+                if (!read_hex16(s, &uc))
+                    goto failed;
+                s += 4;
+
+                if (uc >= 0xD800 && uc <= 0xDFFF) {
+                    /* Handle UTF-16 surrogate pair. */
+
+                    if (uc >= 0xDC00)
+                        goto failed; /* Second surrogate not preceded by
+                                        first surrogate. */
+
+                    if (s[0] != '\\' || s[1] != 'u'
+                        || !read_hex16(s+2, &lc)
+                        || !(lc >= 0xDC00 && lc <= 0xDFFF))
+                        goto failed; /* First surrogate not followed by
+                                        second surrogate. */
+
+                    s += 6;
+
+                    /* The 20 payload bits are an offset from U+10000, so
+                     * they must be ADDED. OR-ing loses the carry whenever
+                     * the high surrogate has bit 6 set (e.g. \uD840\uDC00
+                     * is U+20000, not U+10000). */
+                    uc = 0x10000 + (((uc & 0x3FF) << 10) | (lc & 0x3FF));
+                }
+
+                /* 0xFFFE and 0xFFFF are invalid Unicode */
+                if (uc == 0xFFFE || uc == 0xFFFF)
+                    goto failed;
+
+                /* Reserve the 4-byte worst case, then trim to what the
+                 * encoder actually produced. */
+                size = ret->length;
+                out = string_grow(ret, 4);
+                size += utf8_encode_char(out, uc);
+                string_trunc(ret, size);
+
+                continue; /* Continue the enclosing while loop to skip
+                             the string_append_char below. */
+            }
+            default: /* Invalid escape */
+                goto failed;
+            }
+        } else if (c <= 0x1F) {
+            /* Control characters not allowed in string literals. */
+            goto failed;
+        }
+        string_append_char(ret, c);
+    }
+
+    /* The loop stops at either the closing quote or the end of text. */
+    if (!*s++)
+        goto failed;
+
+    *length = ret->length;
+    *sp = s;
+    return string_buffer(ret);
+
+failed:
+    string_free(ret);
+    return NULL;
+}
+
+/*
+ * Guess the type of the JSON value in the first @nbytes bytes of @str by
+ * looking only at its first non-whitespace character. The rest of the
+ * text is not validated. Returns JSON_INVALID if there is no such
+ * character or it cannot start a JSON value.
+ */
+json_type json_text_type(const char *str, size_t nbytes)
+{
+    const char *cur = str;
+    const char *const limit = str + nbytes;
+    char first;
+
+    /* Skip leading whitespace without running past the given length. */
+    for (; cur < limit; cur++) {
+        if (!is_whitespace(*cur))
+            break;
+    }
+
+    if (cur >= limit)
+        return JSON_INVALID;
+    first = *cur;
+
+    if (first == '"')
+        return JSON_STRING;
+    if (first == '{')
+        return JSON_OBJECT;
+    if (first == '[')
+        return JSON_ARRAY;
+    if (first == 'n')
+        return JSON_NULL;
+    if (first == 't' || first == 'f')
+        return JSON_BOOL;
+    if (first == '-' || (first >= '0' && first <= '9'))
+        return JSON_NUMBER;
+
+    return JSON_INVALID;
+}
+
+
+/****************************** Encoding *****************************/
+
+/*
+ * Append @string (@length bytes) to @out as a quoted JSON string literal.
+ *
+ * '"', '\\' and the control characters that have a short escape are written
+ * as two-character escapes. Every other control character (U+0000 through
+ * U+001F inclusive) is written as \uXXXX, because JSON does not allow them
+ * unescaped. When json_escape_unicode is set, non-ASCII characters are
+ * written as \uXXXX too, using a surrogate pair above U+FFFF.
+ *
+ * Returns false, appending nothing, if the input is not valid UTF-8.
+ */
+static bool encode_string(String out, const char *string, size_t length)
+{
+    const char *s = string;
+    const char *end = s + length;
+
+    if (!utf8_validate(string, length))
+        return false;
+
+    string_append_char(out, '"');
+
+    while (s < end) {
+        unsigned char c = *s++;
+        char esc = 0; /* second character of a two-character escape, or 0 */
+
+        switch (c) {
+        case '"': esc = '"'; break;
+        case '\\': esc = '\\'; break;
+        case '\b': esc = 'b'; break;
+        case '\f': esc = 'f'; break;
+        case '\n': esc = 'n'; break;
+        case '\r': esc = 'r'; break;
+        case '\t': esc = 't'; break;
+        default: break;
+        }
+
+        if (esc) {
+            string_append_char(out, '\\');
+            string_append_char(out, esc);
+        } else if (c <= 0x1F || (c >= 0x80 && json_escape_unicode)) {
+            /* Encode using \u.... The bound is inclusive: 0x1F is a control
+             * character as well and the decoder rejects it unescaped. */
+            unsigned int uc, lc;
+            char txt[13];
+
+            /* Step back so the whole UTF-8 sequence is decoded. */
+            s--;
+            utf8_decode_char_nocheck(&s, &uc);
+
+            txt[0] = '\\';
+            txt[1] = 'u';
+            if (uc <= 0xFFFF) {
+                write_hex16(txt+2, uc);
+                txt[6] = '\0';
+            } else {
+                /* Split into a UTF-16 surrogate pair. */
+                uc -= 0x10000;
+                lc = 0xDC00 | (uc & 0x3FF);
+                uc = 0xD800 | (uc >> 10);
+                write_hex16(txt+2, uc);
+                txt[6] = '\\';
+                txt[7] = 'u';
+                write_hex16(txt+8, lc);
+                txt[12] = '\0';
+            }
+
+            string_append(out, txt);
+        } else {
+            string_append_char(out, c);
+        }
+    }
+
+    string_append_char(out, '"');
+
+    return true;
+}
+
+/*
+ * Append the number text @string to @out after checking that it is a
+ * strictly valid JSON number. Whitespace around the number is tolerated
+ * and dropped. Returns false, appending nothing, if @string is NULL or is
+ * not exactly one valid number.
+ */
+static bool encode_number(String out, const char *string)
+{
+    const char *cursor = string;
+    const char *first, *last;
+
+    if (string == NULL)
+        return false;
+
+    skip_whitespace(&cursor);
+    first = cursor;
+    if (!validate_number(&cursor))
+        return false;
+    last = cursor;
+
+    /* Only trailing whitespace may follow the number. */
+    skip_whitespace(&cursor);
+    if (*cursor == '\0') {
+        string_append_length(out, first, last-first);
+        return true;
+    }
+
+    return false;
+}
+
+/*
+ * Serialize @node, and everything beneath it, as compact JSON text.
+ *
+ * Returns a newly allocated '\0'-terminated string, or NULL if @node is
+ * NULL or anything in the tree cannot be encoded (invalid UTF-8 in a string
+ * or key, a malformed or missing number, an unknown node type).
+ *
+ * Only the subtree rooted at @node is written: the key of @node itself and
+ * any siblings that follow it in its parent are NOT emitted. The walk is
+ * iterative; it stops as soon as @node itself has been finished.
+ */
+char *json_encode(json_node *node)
+{
+    String(ret);
+    const char *txt;
+    json_node *root = node;
+
+    if (!node)
+        return NULL;
+
+    goto begin_nokey;
+
+begin: /* Encode entire node, or (if it's an array or object)
+          the beginning of it. */
+
+    if (node->key) {
+        if (!encode_string(ret, node->key, node->key_length))
+            goto failed;
+        string_append_char(ret, ':');
+    }
+    goto begin_nokey;
+
+begin_nokey:
+
+    txt = NULL;
+    switch (node->type) {
+    case JSON_NULL:
+        txt = "null";
+        break;
+    case JSON_BOOL:
+        if (node->v.v_bool)
+            txt = "true";
+        else
+            txt = "false";
+        break;
+    case JSON_STRING:
+        if (!encode_string(ret, node->v.string.str, node->v.string.length))
+            goto failed;
+        break;
+    case JSON_NUMBER:
+        if (!encode_number(ret, node->v.number))
+            goto failed;
+        break;
+    case JSON_ARRAY:
+        txt = "[";
+        break;
+    case JSON_OBJECT:
+        txt = "{";
+        break;
+    default:
+        goto failed;
+    }
+    if (txt)
+        string_append(ret, txt);
+
+    if (is_internal(node) && node->v.children.head) {
+        node = node->v.children.head;
+        goto begin;
+    } else {
+        goto finish;
+    }
+
+finish: /* Finish a node and move to the next one. */
+    if (node->type == JSON_ARRAY)
+        string_append_char(ret, ']');
+    else if (node->type == JSON_OBJECT)
+        string_append_char(ret, '}');
+
+    /* Stop at the requested node BEFORE looking at its siblings; otherwise
+     * encoding a node that is not its parent's last child would also emit
+     * ",<next sibling>,..." after it. */
+    if (node == root)
+        goto end;
+
+    if (node->next) {
+        string_append_char(ret, ',');
+        node = node->next;
+        goto begin;
+    }
+
+    /* Last child: close the enclosing container. */
+    node = node->parent;
+    goto finish;
+
+end: /* All nodes finished being serialized. */
+    return string_buffer(ret);
+
+failed: /* Handle error. */
+    string_free(ret);
+    return NULL;
+}
+
+
+/************************ Liberal JSON support ***********************/
+
+/*
+ * Return true if @str is accepted by json_decode_liberal(), false
+ * otherwise. The tree built while checking is freed again.
+ */
+bool json_validate_liberal(const char *str)
+{
+    json_node *tree = json_decode_liberal(str);
+
+    if (tree != NULL) {
+        json_delete(tree);
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Decode liberal JSON text: run it through json_cleanup() and parse the
+ * result with the strict json_decode(). Returns NULL if either step fails.
+ */
+json_node *json_decode_liberal(const char *str)
+{
+    char *cleaned = json_cleanup(str);
+    json_node *tree;
+
+    /* json_decode returns NULL for a NULL argument. */
+    tree = json_decode(cleaned);
+
+    /* JSON_free ignores NULL. */
+    JSON_free(cleaned);
+    return tree;
+}
+
+/*
+ * Rewrite "liberal" JSON text into text the strict parser can handle.
+ *
+ * The following rewrites are applied in a single pass over @str:
+ *  - single-quoted strings become double-quoted (double quotes inside them
+ *    are backslash-escaped, and \' becomes \u0027 in any string);
+ *  - a leading '+' on a number is dropped;
+ *  - a missing digit before or after a decimal point is filled in with '0'
+ *    (".5" -> "0.5", "1." -> "1.0");
+ *  - '#' and '//' line comments and C-style block comments are removed.
+ *
+ * Everything else is copied through unchanged, so the result still has to
+ * be parsed by json_decode() to find out whether it is valid.
+ *
+ * Returns a newly allocated string, or NULL if @str is NULL, a number is
+ * malformed beyond repair (leading zeros, no digits, empty exponent, or
+ * immediately followed by another digit, sign or dot), or a block comment
+ * is not closed.
+ */
+char *json_cleanup(const char *str)
+{
+    String(ret);
+    const char *p = str;
+    const char *s = str;
+    int comment_start_width = 0;
+    char quote_char = 0;
+    /* flush(): flush content we have scanned, meaning append characters
+     * from p thru s to ret, then set p to s. */
+    #define flush() do { \
+        string_append_length(ret, p, s-p); \
+        p = s; \
+    } while(0)
+    /* at_digit(): is the character at ptr a decimal digit? The cast is
+     * required: this function sees arbitrary bytes (including UTF-8 bytes
+     * >= 0x80), and handing a negative char to isdigit() is undefined. */
+    #define at_digit(ptr) isdigit((unsigned char) *(ptr))
+
+    if (!str)
+        return NULL;
+
+    goto begin;
+
+begin:
+    /* Scan for the next construct that may need rewriting. */
+    for (;*s; s++) {
+        if (*s == '"' || *s == '\'')
+            goto quote;
+        if (at_digit(s) || *s=='-' || *s=='+' || *s=='.')
+            goto number;
+        if (s[0]=='#') {
+            comment_start_width = 1;
+            goto line_comment;
+        }
+        if (s[0]=='/' && s[1]=='/') {
+            comment_start_width = 2;
+            goto line_comment;
+        }
+        if (s[0]=='/' && s[1]=='*') {
+            comment_start_width = 2;
+            goto c_comment;
+        }
+    }
+    flush();
+    return ret->buffer;
+
+quote:
+    quote_char = *s;
+    if (*s == '\'') {
+        flush();
+        string_append_char(ret, '"');
+        p = s = s+1;
+    } else {
+        s++;
+    }
+    while (*s) {
+        if (*s == quote_char) {
+            if (*s == '\'') {
+                flush();
+                string_append_char(ret, '"');
+                p = s = s+1;
+            } else {
+                s++;
+            }
+            break;
+        } else if (*s == '"') {
+            /* We're converting single quotes to double quotes,
+             * so double quotes need to be automatically escaped. */
+            flush();
+            string_append_char(ret, '\\');
+            s++;
+        } else if (*s == '\\') {
+            s++;
+            switch (*s) {
+            case '\0':
+                break;
+            case '\'':
+                /* Convert \' to \u0027 (the backslash is flushed,
+                 * the quote itself is skipped). */
+                flush();
+                string_append(ret, "u0027");
+                p = s = s+1;
+                break;
+            default:
+                s++;
+            }
+        } else {
+            s++;
+        }
+    }
+    goto begin;
+
+number:
+    /* Skip a '-', or remove a '+' if present. */
+    if (*s == '-') {
+        s++;
+    } else if (*s == '+') {
+        flush();
+        p = s = s+1;
+    }
+    /* Make sure number has at least one digit. */
+    if (!at_digit(s)) {
+        if (*s != '.')
+            goto failed;
+        if (!at_digit(s+1))
+            goto failed;
+    }
+    /* Make sure that if first digit before '.' is '0', that it is the only digit.
+     * Leading 0s are not allowed, and for a good reason: to avoid ambiguity
+     * between octal and decimal formats. */
+    if (*s == '0') {
+        s++;
+        if (at_digit(s))
+            goto failed;
+        goto frac;
+    }
+    /* Skip digits, or add a '0' if none are present. */
+    if (at_digit(s)) {
+        do s++; while (at_digit(s));
+    } else {
+        flush();
+        string_append_char(ret, '0');
+    }
+    goto frac;
+
+frac:
+    if (*s == '.') {
+        s++;
+        if (at_digit(s)) {
+            do s++; while (at_digit(s));
+        } else {
+            flush();
+            string_append_char(ret, '0');
+        }
+    }
+/* exp: */
+    if (*s=='E' || *s=='e') {
+        s++;
+        if (*s=='+' || *s=='-')
+            s++;
+        if (!at_digit(s))
+            goto failed;
+        do s++; while (at_digit(s));
+    }
+    /* The digit check is not needed, but here
+     * for clarity and safety. */
+    if (at_digit(s) || *s=='-' || *s=='+' || *s=='.')
+        goto failed;
+    goto begin;
+
+line_comment: /* Remove all characters up to newline */
+    flush();
+    s += comment_start_width;
+    /* Skip characters up to newline */
+    while (*s && !(*s == '\n' || *s == '\r'))
+        s++;
+    /* Skip newline character and its complement (if present), so that
+     * both "\r\n" and "\n\r" count as a single line ending. */
+    if (s[0]) {
+        if (s[1] == '\n'+'\r'-s[0])
+            s++;
+        s++;
+    }
+    /* Set begin marker so characters skipped are not
+     * appended to output on next flush. */
+    p = s;
+    goto begin;
+
+c_comment: /* Remove all characters up to star-slash */
+    flush();
+    s += comment_start_width;
+    /* Skip characters up to and including star-slash */
+    while (*s && !(s[0] == '*' && s[1] == '/'))
+        s++;
+    if (*s)
+        s += 2;
+    else
+        goto failed; /* No star-slash present */
+    /* Set begin marker so characters skipped are not
+     * appended to output on next flush. */
+    p = s;
+    goto begin;
+
+failed:
+    string_free(ret);
+    return NULL;
+
+    #undef at_digit
+    #undef flush
+}
+
+
+/****************************** Unicode ******************************/
+
+/* When false, utf8_validate() rejects three-byte sequences that encode the
+ * surrogate range U+D800..U+DFFF (lead byte 0xED followed by a byte >= 0xA0). */
+static const bool utf8_allow_surrogates = false;
+
+/*
+ * Decode one UTF-8 sequence at *sp into *uc and advance *sp past it.
+ *
+ * No validation is done: the length is taken from the lead byte alone and
+ * the continuation bytes are assumed to be present and well-formed, so the
+ * input must already have passed utf8_validate().
+ */
+static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
+{
+    const unsigned char *p = (const unsigned char *)*sp;
+    const unsigned char lead = *p++;
+    unsigned int extra; /* number of continuation bytes that follow */
+    unsigned int value; /* payload bits collected so far */
+
+    if (lead < 0x80) {
+        extra = 0;
+        value = lead;
+    } else if (lead < 0xE0) {
+        extra = 1;
+        value = lead & 0x1F;
+    } else if (lead < 0xF0) {
+        extra = 2;
+        value = lead & 0x0F;
+    } else {
+        extra = 3;
+        value = lead & 0x07;
+    }
+
+    /* Each continuation byte contributes its low six bits. */
+    for (; extra > 0; extra--)
+        value = (value << 6) | (*p++ & 0x3F);
+
+    *uc = value;
+    *sp = (const char *)p;
+}
+
+/*
+ * Check that the @length bytes at @str are well-formed UTF-8.
+ *
+ * Besides the basic lead/continuation byte structure this rejects overlong
+ * encodings, code points above U+10FFFF, U+FFFE and U+FFFF, and (unless
+ * utf8_allow_surrogates is set) the surrogate range. Zero bytes count as
+ * ordinary ASCII, so the text need not be '\0'-terminated.
+ *
+ * Returns true if the whole range is valid (an empty range is valid).
+ */
+static bool utf8_validate(const char *str, size_t length)
+{
+ const unsigned char *s = (const unsigned char*)str;
+ const unsigned char *e = s + length;
+
+ while (s < e) {
+ unsigned char c = *s++;
+ unsigned int len; /* number of bytes in sequence - 2 */
+
+ /* If character is ASCII, move on. */
+ if (c < 0x80)
+ continue;
+
+ /* From here on s points at the first continuation byte. */
+ if (s >= e)
+ return false; /* Missing bytes in sequence. */
+
+ if (c < 0xE0) {
+ /* 2-byte sequence, U+0080 to U+07FF
+ c must be 11000010 or higher
+ s[0] must be 10xxxxxx */
+ /* This also rejects a stray continuation byte (0x80..0xBF)
+ in lead position. */
+ len = 0;
+ if (c < 0xC2)
+ return false;
+ } else if (c < 0xF0) {
+ /* 3-byte sequence, U+0800 to U+FFFF
+ Note that the surrogate range is U+D800 to U+DFFF,
+ and that U+FFFE and U+FFFF are illegal characters.
+ c must be >= 11100000 (which it is)
+ If c is 11100000, then s[0] must be >= 10100000
+ If the global parameter utf8_allow_surrogates is false:
+ If c is 11101101 and s[0] is >= 10100000,
+ then this is a surrogate and we should fail.
+ If c is 11101111, s[0] is 10111111, and s[1] >= 10111110,
+ then this is an illegal character and we should fail.
+ s[0] and s[1] must be 10xxxxxx */
+ len = 1;
+ if (c == 0xE0 && *s < 0xA0)
+ return false;
+ if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0)
+ return false;
+ if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE))
+ return false;
+ } else {
+ /* 4-byte sequence, U+010000 to U+10FFFF
+ c must be >= 11110000 (which it is) and <= 11110100
+ If c is 11110000, then s[0] must be >= 10010000
+ If c is 11110100, then s[0] must be < 10010000
+ s[0], s[1], and s[2] must be 10xxxxxx */
+ len = 2;
+ if (c > 0xF4)
+ return false;
+ if (c == 0xF0 && *s < 0x90)
+ return false;
+ if (c == 0xF4 && *s >= 0x90)
+ return false;
+ }
+
+ /* s + len is the last continuation byte; it must lie inside the range. */
+ if (s + len >= e)
+ return false; /* Missing bytes in sequence. */
+
+ /* Runs len + 1 times: once for every continuation byte. */
+ do {
+ if ((*s++ & 0xC0) != 0x80)
+ return false;
+ } while (len--);
+ }
+
+ return true;
+}
+
+/*
+ * Encode the Unicode code point @uc as UTF-8 into the buffer at @out, which
+ * must have room for up to 4 bytes. No terminator is written.
+ *
+ * If @uc is above U+10FFFF nothing is written. Code points in the UTF-16
+ * surrogate range (U+D800 thru U+DFFF) and the not-a-characters U+FFFE and
+ * U+FFFF are encoded anyway, although the result is technically invalid
+ * UTF-8.
+ *
+ * Returns the number of bytes written (0 to 4).
+ */
+static int utf8_encode_char(char *out, unsigned int uc)
+{
+    if (uc < 0x80) {
+        out[0] = uc & 0x7F;
+        return 1;
+    }
+    if (uc < 0x800) {
+        out[0] = 0xC0 | (uc >> 6);
+        out[1] = 0x80 | (uc & 0x3F);
+        return 2;
+    }
+    if (uc < 0x10000) {
+        out[0] = 0xE0 | (uc >> 12);
+        out[1] = 0x80 | ((uc >> 6) & 0x3F);
+        out[2] = 0x80 | (uc & 0x3F);
+        return 3;
+    }
+    if (uc < 0x110000) {
+        out[0] = 0xF0 | ((uc >> 18) & 0x07);
+        out[1] = 0x80 | ((uc >> 12) & 0x3F);
+        out[2] = 0x80 | ((uc >> 6) & 0x3F);
+        out[3] = 0x80 | (uc & 0x3F);
+        return 4;
+    }
+
+    /* Out of range: emit nothing. */
+    return 0;
+}
+