From bb5bbf028efbf663c15628e748a2a878e766ef65 Mon Sep 17 00:00:00 2001
From: Joey Adams
Date: Wed, 9 Jun 2010 00:19:29 -0400
Subject: Moved everything in /contrib/json along with json.sgml to root
 directory.

This repository should probably be an honest-to-goodness branch of
mainline PostgreSQL.  I'm looking into that :-)
---
 json.c | 1210 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1210 insertions(+)
 create mode 100644 json.c

(limited to 'json.c')

diff --git a/json.c b/json.c
new file mode 100644
index 0000000..d530edd
--- /dev/null
+++ b/json.c
@@ -0,0 +1,1210 @@
+/*
+  Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com)
+  All rights reserved.
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include "json.h"
+
+#include <ctype.h>
+
+
+
+bool json_escape_unicode = false;
+
+#define JSON_malloc palloc
+
+/*
+ * NULL-tolerant wrappers: unlike realloc() and free(), repalloc() and pfree()
+ * must not be handed a null pointer, so that case is handled here.
+ */
+static void *JSON_realloc(void *ptr, Size size)
+{
+	return ptr != NULL ? repalloc(ptr, size) : palloc(size);
+}
+static void JSON_free(void *ptr)
+{
+	if (ptr != NULL)
+		pfree(ptr);
+}
+
+/* Copy @length bytes of @str into a fresh buffer and NUL-terminate it. */
+static char *JSON_strdup(const char *str, size_t length)
+{
+	char *copy = JSON_malloc(length + 1);
+	copy[length] = '\0';
+	return memcpy(copy, str, length);
+}
+
+#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT)
+
+/* Not isspace(): it also accepts \v and \f, which strict JSON does not allow. */
+#define is_whitespace(c) ((c)==' ' || (c)=='\t' || (c)=='\n' || (c)=='\r')
+
+/* Advance *sp past any run (possibly empty) of JSON whitespace. */
+static void skip_whitespace(const char **sp)
+{
+	const char *cursor;
+	for (cursor = *sp; is_whitespace(*cursor); cursor++)
+		continue;
+	*sp = cursor;
+}
+
+/* Closing bracket for an array/object node; 0 for NULL or any other type. */
+static char end_parenthesis(json_node *node)
+{
+	if (node == NULL)
+		return 0;
+
+	if (node->type == JSON_ARRAY)
+		return ']';
+	return node->type == JSON_OBJECT ? '}' : 0;
+}
+
+/*
+ * Parse exactly four hexadecimal digits (either case) starting at @in.
+ *
+ * On success the 16-bit value is stored in *out and true is returned.
+ * On failure false is returned and *out holds only the digits parsed so far.
+ */
+static bool read_hex16(const char *in, unsigned int *out)
+{
+	unsigned int remaining = 4;
+
+	*out = 0;
+
+	while (remaining-- > 0) {
+		char ch = *in++;
+		unsigned int digit;
+
+		if (ch >= '0' && ch <= '9')
+			digit = ch - '0';
+		else if (ch >= 'A' && ch <= 'F')
+			digit = ch - 'A' + 10;
+		else if (ch >= 'a' && ch <= 'f')
+			digit = ch - 'a' + 10;
+		else
+			return false;
+
+		*out = (*out << 4) + digit;
+	}
+
+	return true;
+}
+
+/* Store the low 16 bits of val at out[0..3] as uppercase hex; no NUL added. */
+static void write_hex16(char *out, unsigned int val)
+{
+	static const char digits[] = "0123456789ABCDEF";
+	int shift;
+	for (shift = 12; shift >= 0; shift -= 4)
+		*out++ = digits[(val >> shift) & 0xF];
+}
+
+static bool utf8_validate(const char *str, size_t length);
+static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
+static int utf8_encode_char(char *out, unsigned int uc);
+
+
+/*************************** String buffer ***************************/
+
+typedef struct {
+	char *buffer;	/* heap storage; NULL in a freshly declared String(name) */
+	size_t length;	/* bytes in use, not counting the '\0' terminator */
+	size_t alloc;	/* bytes allocated for buffer */
+} String[1];	/* one-element array: passed to functions as a pointer */
+
+/* Declare and initialize an empty String (no buffer yet) with the given name. */
+#define String(name) String name = {{NULL, 0, 0}}
+
+/* Extend the string by @need bytes, enlarging the buffer when it cannot hold
+ * the new length plus a terminator.  Returns the start of the newly added,
+ * still uninitialized region; the '\0' after it is written here. */
+static char *string_grow(String str, size_t need)
+{
+	const size_t old_length = str->length;
+	const size_t new_length = old_length + need;
+	str->length = new_length;
+	if (new_length >= str->alloc) {
+		size_t capacity = new_length*3/2 + 1;	/* grow by about 1.5x */
+		str->alloc = capacity < 8 ? 8 : capacity;
+		str->buffer = JSON_realloc(str->buffer, str->alloc);
+	}
+	str->buffer[new_length] = '\0';
+	return str->buffer + old_length;
+}
+static char *string_buffer(String str)
+{
+	if (str->buffer == NULL)
+		string_grow(str, 0);	/* allocate an empty, NUL-terminated buffer */
+	return str->buffer;
+}
+/* Append @len bytes starting at @append. */
+static inline void string_append_length(String str, const char *append, size_t len)
+{
+	memcpy(string_grow(str, len), append, len);
+}
+static inline void string_append(String str, const char *append)
+{
+	string_append_length(str, append, strlen(append));	/* C-string form */
+}
+static inline void string_append_char(String str, char c)
+{
+	string_grow(str, 1)[0] = c;
+}
+static inline void string_trunc(String str, size_t len)
+{
+	str->buffer[len] = '\0';	/* buffer must already exist: no NULL check */
+	str->length = len;
+}
+static inline void string_free(String str)
+{
+	JSON_free(str->buffer);
+}
+
+
+/*********** json_node creation, manipulation, and deletion **********/
+
+/* Allocate a zero-filled node of the given type. */
+json_node *json_mknode(json_type type)
+{
+	json_node *fresh = memset(JSON_malloc(sizeof(*fresh)), 0, sizeof(*fresh));
+	fresh->type = type;
+	return fresh;
+}
+/* Create a JSON_BOOL node holding v_bool. */
+json_node *json_mkbool(bool v_bool)
+{
+	json_node *fresh = json_mknode(JSON_BOOL);
+	fresh->v.v_bool = v_bool;
+	return fresh;
+}
+/* Create a JSON_STRING node owning a copy of str[0..length); NULL str = unset. */
+json_node *json_mkstring(const char *str, size_t length)
+{
+	json_node *fresh = json_mknode(JSON_STRING);
+	if (str == NULL)
+		return fresh;
+	fresh->v.string.length = length;
+	fresh->v.string.str = JSON_strdup(str, length);
+	return fresh;
+}
+/* Create a JSON_NUMBER node owning a copy of number[0..length); NULL = unset. */
+json_node *json_mknumber(const char *number, size_t length)
+{
+	json_node *fresh = json_mknode(JSON_NUMBER);
+	if (number != NULL)
+		fresh->v.number = JSON_strdup(number, length);
+	return fresh;
+}
+
+/* Link a parentless child onto the end of an array/object's child list. */
+void json_append(json_node *parent, json_node *child)
+{
+	json_node *last;
+	Assert(is_internal(parent));
+	Assert(child->parent == NULL);
+	last = parent->v.children.tail;
+	child->parent = parent;
+	child->prev = last;
+	child->next = NULL;
+	if (last != NULL)
+		last->next = child;
+	else
+		parent->v.children.head = child;
+	parent->v.children.tail = child;
+	parent->v.children.count++;
+}
+
+/* Detach node from its parent's child list; a node without a parent is left
+   untouched.  Nothing is freed. */
+void json_remove(json_node *node)
+{
+	json_node *owner = node->parent;
+	json_node *before = node->prev, *after = node->next;
+
+	if (owner == NULL)
+		return;
+	Assert(is_internal(owner));
+	Assert(owner->v.children.count > 0);
+
+	if (before != NULL)
+		before->next = after;
+	else
+		owner->v.children.head = after;
+	if (after != NULL)
+		after->prev = before;
+	else
+		owner->v.children.tail = before;
+	owner->v.children.count--;
+	node->parent = node->prev = node->next = NULL;
+}
+
+/* Return the string node's text (may be NULL); its byte length goes to
+   *length_out when that pointer is non-NULL. */
+const char *json_get_string(json_node *node, size_t *length_out)
+{
+	Assert(node->type == JSON_STRING);
+	if (length_out != NULL)
+		*length_out = node->v.string.length;
+	return node->v.string.str;
+}
+
+/* Free the old text, then store a copy of str[0..length) or clear if NULL. */
+void json_set_string(json_node *node, const char *str, size_t length)
+{
+	Assert(node->type == JSON_STRING);
+	JSON_free(node->v.string.str);	/* JSON_free() ignores NULL */
+	if (str != NULL) {
+		node->v.string.str = JSON_strdup(str, length);
+		node->v.string.length = length;
+	} else {
+		node->v.string.str = NULL;
+		node->v.string.length = 0;
+	}
+}
+
+/* Return the number node's text (may be NULL). */
+const char *json_get_number(json_node *node)
+{
+	Assert(node->type == JSON_NUMBER);
+	return node->v.number;
+}
+
+/* Free the old text, then store a copy of number[0..length), or clear it
+   when number is NULL. */
+void json_set_number(json_node *node, const char *number, size_t length)
+{
+	Assert(node->type == JSON_NUMBER);
+	JSON_free(node->v.number);	/* JSON_free() ignores NULL */
+	node->v.number = number ? JSON_strdup(number, length) : NULL;
+}
+
+/* Free one node with its key and scalar payload; children are not visited. */
+static void free_node(json_node *node)
+{
+	switch (node->type) {
+		case JSON_STRING: JSON_free(node->v.string.str); break;
+		case JSON_NUMBER: JSON_free(node->v.number); break;
+		default: break;
+	}
+	JSON_free(node->key);	/* JSON_free() ignores NULL */
+	JSON_free(node);
+}
+
+/*
+ * Free node and everything beneath it, after unlinking it from its parent
+ * (if it has one).  Passing NULL is allowed and does nothing.
+ *
+ * The walk is iterative: descend to a node with no children, free it, move
+ * on to its next sibling, and once the siblings run out climb to the parent
+ * (whose children are all freed by then) and free that in turn.
+ */
+void json_delete(json_node *node)
+{
+	if (!node)
+		return;
+
+	/* Remove node from parent (if it has one). */
+	json_remove(node);
+
+	for (;;) {
+		json_node *up, *sibling;
+
+		/* Descend to the first node that has no children. */
+		while (is_internal(node) && node->v.children.head)
+			node = node->v.children.head;
+
+		/* Free it, then keep climbing while there is no sibling to visit. */
+		do {
+			up = node->parent;
+			sibling = node->next;
+			free_node(node);
+			node = sibling ? sibling : up;
+		} while (!sibling && node);
+
+		if (!node)
+			return;
+	}
+}
+
+
+/*********************** Parsing and validation **********************/
+
+static json_node *decode_leaf(const char **sp);
+static json_node *decode_number(const char **sp);
+static char *decode_string(const char **sp, size_t *length);
+/* decode_string has a different signature than its friends
+   because it's also used to parse object member keys. */
+
+/* True iff str decodes as strict JSON; the decoded tree is thrown away. */
+bool json_validate(const char *str)
+{
+	json_node *tree = json_decode(str);
+	bool ok = (tree != NULL);
+	json_delete(tree);	/* no-op for NULL */
+	return ok;
+}
+
+json_node *json_decode(const char *str)
+{
+	json_node *root = NULL, *parent = NULL, *node = NULL;
+	const char *s = str;
+	char *key = NULL;
+	size_t key_length = 0;
+	/* Parse strict JSON text into a tree; returns its root, or NULL on any error. */
+	if (!str)
+		return NULL;
+	/* Input that is not valid UTF-8 is rejected before any syntax is examined. */
+	if (!utf8_validate(str, strlen(str)))
+		return NULL;
+	/* From here on this is a goto-driven state machine; `parent` is the open container. */
+	goto item;
+	
+item: /* Expect a value */
+	skip_whitespace(&s);
+	
+	if (parent && parent->type == JSON_OBJECT) {
+		/* Parse member key string. */
+		key = decode_string(&s, &key_length);
+		if (!key)
+			goto failed;
+		
+		/* Eat the " : " */
+		skip_whitespace(&s);
+		if (*s != ':')
+			goto failed;
+		s++;
+		skip_whitespace(&s);
+	} else {
+		key = NULL;
+	}
+	/* Try scalars first; decode_leaf() returns NULL for '[' and '{' as well as for errors. */
+	node = decode_leaf(&s);
+	if (!node) {
+		if (*s == '[')
+			node = json_mknode(JSON_ARRAY);
+		else if (*s == '{')
+			node = json_mknode(JSON_OBJECT);
+		else
+			goto failed;
+		s++;
+	}
+	/* Hand the key buffer over to the node so the failure path does not free it twice. */
+	if (key) {
+		node->key = key;
+		node->key_length = key_length;
+		key = NULL;
+	}
+	/* The first value parsed has no parent and becomes the root. */
+	if (parent)
+		json_append(parent, node);
+	else
+		root = node;
+	/* A new container becomes the current parent; its contents come next. */
+	if (is_internal(node)) {
+		parent = node;
+		goto item_endp;
+	}
+	/* A scalar inside a container is followed by ',' or a closer; a bare scalar ends the text. */
+	if (parent)
+		goto comma_endp;
+	else
+		goto end;
+	
+comma_endp: /* Expect a comma or end bracket/brace */
+	skip_whitespace(&s);
+	
+	if (*s == ',') {
+		s++;
+		goto item;
+	}
+	if (*s == ']' || *s == '}')
+		goto endp;
+	
+	goto failed;
+
+item_endp: /* Expect a value or end bracket/brace */
+	skip_whitespace(&s);
+	if (*s == ']' || *s == '}')
+		goto endp;
+	goto item;
+
+endp: /* Handle an end bracket/brace */
+	if (*s != end_parenthesis(parent))
+		goto failed; /* closer does not match the open container's type */
+	s++;
+	node = parent; /* the container just closed */
+	parent = parent->parent; /* pop one nesting level */
+	if (parent)
+		goto comma_endp;
+	else
+		goto end;
+
+end: /* Expect end of text */
+	skip_whitespace(&s);
+	if (*s)
+		goto failed;
+	return node;
+	
+failed: /* Handle failure */
+	if (key)
+		JSON_free(key);
+	json_delete(root); /* frees every node attached so far; a NULL root is accepted */
+	return NULL;
+}
+
+/*
+ * Decode one value that has no children (string, number, true, false or
+ * null) at *sp and advance *sp past it.  Whitespace is not skipped first;
+ * the primary decode loop does that.  Keywords are matched by prefix only,
+ * so for "truex" a bool is returned and *sp is left pointing at 'x'.
+ *
+ * Returns NULL if the next character is '[' or '{', or if the text is not
+ * a valid leaf.
+ */
+static json_node *decode_leaf(const char **sp)
+{
+	const char *text = *sp;
+	json_node *leaf = NULL;
+
+	if (*text == '"') {
+		size_t length;
+		char *str = decode_string(sp, &length);
+
+		/* The node takes ownership of the decoded buffer. */
+		if (str != NULL) {
+			leaf = json_mknode(JSON_STRING);
+			leaf->v.string.str = str;
+			leaf->v.string.length = length;
+		}
+	} else if (*text == '-' || (*text >= '0' && *text <= '9')) {
+		leaf = decode_number(sp);
+	} else if (strncmp(text, "true", 4) == 0) {
+		leaf = json_mkbool(true);
+		*sp = text + 4;
+	} else if (strncmp(text, "false", 5) == 0) {
+		leaf = json_mkbool(false);
+		*sp = text + 5;
+	} else if (strncmp(text, "null", 4) == 0) {
+		leaf = json_mknode(JSON_NULL);
+		*sp = text + 4;
+	}
+
+	return leaf;
+}
+
+/*
+ * The JSON spec says that a number shall follow this precise pattern
+ * (spaces and quotes added for readability):
+ *   '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)?
+ *
+ * Some JSON parsers are more liberal (PHP accepts '.5' and '1.', JSON.parse
+ * accepts '+3').  This function is strict; run json_cleanup() on liberal
+ * text first.
+ * Returns true and advances *sp past the number on success; returns false
+ * and leaves *sp unchanged otherwise.
+ */
+static bool validate_number(const char **sp)
+{
+	/* Walk as unsigned char: isdigit() is undefined for negative char values. */
+	const unsigned char *s = (const unsigned char *) *sp;
+	/* '-'? */
+	if (*s == '-')
+		s++;
+	
+	/* (0 | [1-9][0-9]*) */
+	if (*s == '0') {
+		s++;
+	} else {
+		if (!isdigit(*s))
+			return false;
+		do s++; while (isdigit(*s));
+	}
+
+	/* ('.' [0-9]+)? */
+	if (*s == '.') {
+		s++;
+		if (!isdigit(*s))
+			return false;
+		do s++; while (isdigit(*s));
+	}
+	
+	/* ([Ee] [+-]? [0-9]+)? */
+	if (*s=='E' || *s=='e') {
+		s++;
+		if (*s=='+' || *s=='-')
+			s++;
+		if (!isdigit(*s))
+			return false;
+		do s++; while (isdigit(*s));
+	}
+	
+	*sp = (const char *) s;
+	return true;
+}
+
+/* Decode a strict JSON number at *sp into a JSON_NUMBER node holding its text;
+   returns NULL (with *sp untouched) when the text is not a valid number. */
+static json_node *decode_number(const char **sp)
+{
+	const char *const first = *sp;
+
+	if (validate_number(sp))
+		return json_mknumber(first, (size_t) (*sp - first));
+
+	return NULL;
+}
+
+/*
+ * Decode the string literal at *sp (which must point at the opening '"'),
+ * turning escapes, including \uXXXX and surrogate pairs, into UTF-8.
+ * Returns a new NUL-terminated buffer, sets *length and advances *sp past
+ * the closing quote; returns NULL on malformed input, changing neither.
+ */
+static char *decode_string(const char **sp, size_t *length)
+{
+	const char *s = *sp;
+	String(ret);
+	char *out;
+	size_t size;
+	
+	if (*s++ != '"')
+		return NULL;
+	
+	while (*s && *s != '"') {
+		unsigned char c = *s++;
+		unsigned int uc, lc;
+		
+		if (c == '\\') {
+			c = *s++;
+			switch (c) {
+				case '"':
+				case '\\':
+				case '/':
+					break;
+				case 'b': c = '\b'; break;
+				case 'f': c = '\f'; break;
+				case 'n': c = '\n'; break;
+				case 'r': c = '\r'; break;
+				case 't': c = '\t'; break;
+				case 'u':
+					/* Reserve 4 bytes, the longest UTF-8 form; trimmed below. */
+					size = ret->length;
+					out = string_grow(ret, 4);
+					if (!read_hex16(s, &uc))
+						goto failed;
+					s += 4;
+					if (uc >= 0xD800 && uc <= 0xDFFF) {
+						/* Handle UTF-16 surrogate pair. */
+						if (uc >= 0xDC00)
+							goto failed; /* low surrogate without a preceding high one */
+						if (s[0] != '\\' || s[1] != 'u'
+						|| !read_hex16(s+2, &lc)
+						|| !(lc >= 0xDC00 && lc <= 0xDFFF))
+							goto failed; /* high surrogate not followed by a low one */
+						s += 6;
+						/* The 0x10000 offset must be added, not OR-ed: bit 16 of
+						   the shifted high half is already set from U+20000 up. */
+						uc = 0x10000 + (((uc & 0x3FF) << 10) | (lc & 0x3FF));
+					}
+					
+					/* 0xFFFE and 0xFFFF are invalid Unicode */
+					if (uc == 0xFFFE || uc == 0xFFFF)
+						goto failed;
+					
+					size += utf8_encode_char(out, uc);
+					string_trunc(ret, size);
+					
+					continue; /* skip the string_append_char() below */
+				default: /* Invalid escape */
+					goto failed;
+			}
+		} else if (c <= 0x1F) {
+			/* Control characters not allowed in string literals. */
+			goto failed;
+		}
+		string_append_char(ret, c);
+	}
+	
+	if (!*s++)
+		goto failed;
+	
+	*length = ret->length;
+	*sp = s;
+	return string_buffer(ret);
+	
+failed:
+	string_free(ret);
+	return NULL;
+}
+
+/*
+ * Guess the type of a JSON text from its first non-whitespace byte alone.
+ * The text is @nbytes long and need not be NUL-terminated.  Nothing beyond
+ * that byte is checked, so "nonsense" is reported as JSON_NULL.  Returns
+ * JSON_INVALID for empty or all-whitespace input or any other first byte.
+ */
+json_type json_text_type(const char *str, size_t nbytes)
+{
+	size_t i = 0;
+	char first;
+
+	/* Skip leading JSON whitespace without running past nbytes. */
+	while (i < nbytes && is_whitespace(str[i]))
+		i++;
+	if (i == nbytes)
+		return JSON_INVALID;
+	first = str[i];
+
+	if (first == 'n')
+		return JSON_NULL;
+	if (first == '"')
+		return JSON_STRING;
+	if (first == 't' || first == 'f')
+		return JSON_BOOL;
+	if (first == '{')
+		return JSON_OBJECT;
+	if (first == '[')
+		return JSON_ARRAY;
+	if (first == '-' || (first >= '0' && first <= '9'))
+		return JSON_NUMBER;
+
+	return JSON_INVALID;
+}
+
+
+/****************************** Encoding *****************************/
+
+/*
+ * Append @string (@length bytes of UTF-8) to @out as a quoted JSON string.
+ * '"', '\' and control characters with a short form get a backslash escape;
+ * other control characters become \uXXXX, as does every non-ASCII character
+ * when json_escape_unicode is set (a surrogate pair above U+FFFF).
+ * Returns false, appending nothing, if @string is not valid UTF-8.
+ */
+static bool encode_string(String out, const char *string, size_t length)
+{
+	const char *s = string;
+	const char *end = s + length;
+
+	if (!utf8_validate(string, length))
+		return false;
+	string_append_char(out, '"');
+
+	while (s < end) {
+		unsigned char c = *s++;
+		unsigned char esc;	/* letter to put after a backslash, or 0 */
+
+		switch (c) {
+			case '"': esc = '"'; break;
+			case '\\': esc = '\\'; break;
+			case '\b': esc = 'b'; break;
+			case '\f': esc = 'f'; break;
+			case '\n': esc = 'n'; break;
+			case '\r': esc = 'r'; break;
+			case '\t': esc = 't'; break;
+			default: {
+				/* 0x1F is itself a control character, hence <= and not <. */
+				if (c <= 0x1F || (c >= 0x80 && json_escape_unicode)) {
+					/* Encode using \u.... */
+					unsigned int uc;
+					char txt[13];
+					s--;
+					utf8_decode_char_nocheck(&s, &uc);
+					txt[0] = '\\';
+					txt[1] = 'u';
+					txt[6] = '\\';
+					txt[7] = 'u';
+					if (uc <= 0xFFFF) {
+						write_hex16(txt+2, uc);
+						txt[6] = '\0';
+					} else {
+						/* Split into a UTF-16 surrogate pair. */
+						uc -= 0x10000;
+						write_hex16(txt+2, 0xD800 | (uc >> 10));
+						write_hex16(txt+8, 0xDC00 | (uc & 0x3FF));
+						txt[12] = '\0';
+					}
+					string_append(out, txt);
+					continue; /* Skip backslash-encoding code below. */
+				}
+				esc = 0;
+			}
+		}
+
+		string_append_char(out, esc ? '\\' : c);
+		if (esc)
+			string_append_char(out, esc);
+	}
+
+	string_append_char(out, '"');
+	return true;
+}
+
+/* Append the number text @string to @out without surrounding whitespace.
+ * Returns false, leaving @out alone, when @string is NULL or is not exactly
+ * one strictly valid JSON number. */
+static bool encode_number(String out, const char *string)
+{
+	const char *first, *cursor;
+	size_t digits;
+
+	if (string == NULL)
+		return false;
+	first = string;
+	skip_whitespace(&first);
+	cursor = first;
+	if (!validate_number(&cursor))
+		return false;
+	digits = (size_t) (cursor - first);
+	skip_whitespace(&cursor);
+	if (*cursor != '\0')
+		return false;
+
+	string_append_length(out, first, digits);
+	return true;
+}
+
+/*
+ * Serialize node (and, for arrays and objects, everything below it) as
+ * compact JSON text.  Returns a newly allocated NUL-terminated string, or
+ * NULL if node is NULL or the tree holds an unknown type or a string or
+ * number that fails to encode.  Only the subtree rooted at node is written:
+ * its own key is not emitted and its siblings are never visited.
+ */
+char *json_encode(json_node *node)
+{
+	String(ret);
+	const char *txt;
+	json_node *root = node;
+	
+	if (!node)
+		return NULL;
+	
+	goto begin_nokey;
+	
+begin: /* Encode a non-root node: its key (if any), then the node itself. */
+	if (node->key) {
+		if (!encode_string(ret, node->key, node->key_length))
+			goto failed;
+		string_append_char(ret, ':');
+	}
+	goto begin_nokey;
+	
+begin_nokey: /* Encode a scalar entirely, or the opening of an array/object. */
+	txt = NULL;
+	switch (node->type) {
+		case JSON_NULL:
+			txt = "null";
+			break;
+		case JSON_BOOL:
+			txt = node->v.v_bool ? "true" : "false";
+			break;
+		case JSON_STRING:
+			if (!encode_string(ret, node->v.string.str, node->v.string.length))
+				goto failed;
+			break;
+		case JSON_NUMBER:
+			if (!encode_number(ret, node->v.number))
+				goto failed;
+			break;
+		case JSON_ARRAY:
+			txt = "[";
+			break;
+		case JSON_OBJECT:
+			txt = "{";
+			break;
+		default:
+			goto failed;
+	}
+	if (txt)
+		string_append(ret, txt);
+	
+	if (is_internal(node) && node->v.children.head) {
+		node = node->v.children.head;
+		goto begin;
+	}
+	goto finish;
+	
+finish: /* Finish a node and move to the next one. */
+	if (node->type == JSON_ARRAY)
+		string_append_char(ret, ']');
+	else if (node->type == JSON_OBJECT)
+		string_append_char(ret, '}');
+	
+	/* Stop at the node we were asked to encode.  Testing this before
+	   node->next keeps the siblings of an attached root out of the output. */
+	if (node == root)
+		goto end;
+	if (node->next) {
+		string_append_char(ret, ',');
+		node = node->next;
+		goto begin;
+	}
+	node = node->parent; /* last child: close the enclosing container */
+	goto finish;
+
+end: /* All nodes finished being serialized. */
+	return string_buffer(ret);
+	
+failed: /* Handle error. */
+	string_free(ret);
+	return NULL;
+}
+
+
+/************************ Liberal JSON support ***********************/
+
+/* True iff str decodes after json_cleanup(); the decoded tree is discarded. */
+bool json_validate_liberal(const char *str)
+{
+	json_node *tree = json_decode_liberal(str);
+	bool ok = (tree != NULL);
+	json_delete(tree);	/* no-op for NULL */
+	return ok;
+}
+
+/* Normalize liberal JSON with json_cleanup(), then decode the result strictly. */
+json_node *json_decode_liberal(const char *str)
+{
+	char *strict_text = json_cleanup(str);
+	json_node *tree = json_decode(strict_text);	/* NULL text yields NULL */
+	JSON_free(strict_text);	/* JSON_free() ignores NULL */
+	return tree;
+}
+
+/*
+ * Rewrite liberal JSON text as strict JSON: '#', '//' and C-style comments
+ * are dropped, single-quoted strings become double-quoted, and numbers are
+ * normalized ('+3' -> '3', '.5' -> '0.5', '1.' -> '1.0').  Returns a new
+ * string, or NULL when str is NULL or a number or comment is beyond repair.
+ * Characters are cast to unsigned char before isdigit(): passing a negative
+ * char (any byte >= 0x80 where char is signed) is undefined behavior.
+ */
+char *json_cleanup(const char *str)
+{
+	String(ret);
+	const char *p = str;
+	const char *s = str;
+	int comment_start_width = 0;
+	char quote_char = 0;
+	/* flush(): flush content we have scanned, meaning append characters
+	 *          from p up to (not including) s to ret, then set p to s. */
+	#define flush() do { \
+			string_append_length(ret, p, s-p); \
+			p = s; \
+		} while(0)
+	
+	if (!str)
+		return NULL;
+	goto begin;
+	
+begin:
+	for (;*s; s++) {
+		if (*s == '"' || *s == '\'')
+			goto quote;
+		if (isdigit((unsigned char) *s) || *s=='-' || *s=='+' || *s=='.')
+			goto number;
+		if (s[0]=='#') {
+			comment_start_width = 1;
+			goto line_comment;
+		}
+		if (s[0]=='/' && s[1]=='/') {
+			comment_start_width = 2;
+			goto line_comment;
+		}
+		if (s[0]=='/' && s[1]=='*') {
+			comment_start_width = 2;
+			goto c_comment;
+		}
+	}
+	flush();
+	return ret->buffer;
+
+quote:
+	quote_char = *s;
+	if (*s == '\'') {
+		flush();
+		string_append_char(ret, '"');
+		p = s = s+1;
+	} else {
+		s++;
+	}
+	while (*s) {
+		if (*s == quote_char) {
+			if (*s == '\'') {
+				flush();
+				string_append_char(ret, '"');
+				p = s = s+1;
+			} else {
+				s++;
+			}
+			break;
+		} else if (*s == '"') {
+			/* Inside a single-quoted string a bare '"' must be escaped. */
+			flush();
+			string_append_char(ret, '\\');
+			s++;
+		} else if (*s == '\\') {
+			s++;
+			switch (*s) {
+				case '\0':
+					break;
+				case '\'':
+					/* Convert \' to \u0027 */
+					flush();
+					string_append(ret, "u0027");
+					p = s = s+1;
+					break;
+				default:
+					s++;
+			}
+		} else {
+			s++;
+		}
+	}
+	goto begin;
+
+number:
+	/* Skip a '-', or remove a '+' if present. */
+	if (*s == '-') {
+		s++;
+	} else if (*s == '+') {
+		flush();
+		p = s = s+1;
+	}
+	/* Make sure number has at least one digit. */
+	if (!isdigit((unsigned char) *s)) {
+		if (*s != '.')
+			goto failed;
+		if (!isdigit((unsigned char) s[1]))
+			goto failed;
+	}
+	/* A leading '0' must stand alone: no octal/decimal ambiguity. */
+	if (*s == '0') {
+		s++;
+		if (isdigit((unsigned char) *s))
+			goto failed;
+		goto frac;
+	}
+	/* Skip digits, or add a '0' if none are present. */
+	if (isdigit((unsigned char) *s)) {
+		do s++; while (isdigit((unsigned char) *s));
+	} else {
+		flush();
+		string_append_char(ret, '0');
+	}
+	goto frac;
+	
+frac:
+	if (*s == '.') {
+		s++;
+		if (isdigit((unsigned char) *s)) {
+			do s++; while (isdigit((unsigned char) *s));
+		} else {
+			flush();
+			string_append_char(ret, '0');
+		}
+	}
+/* exp: */
+	if (*s=='E' || *s=='e') {
+		s++;
+		if (*s=='+' || *s=='-')
+			s++;
+		if (!isdigit((unsigned char) *s))
+			goto failed;
+		do s++; while (isdigit((unsigned char) *s));
+	}
+	/* The isdigit check is not needed, but here for clarity and safety. */
+	if (isdigit((unsigned char) *s) || *s=='-' || *s=='+' || *s=='.')
+		goto failed;
+	goto begin;
+
+line_comment: /* Remove all characters up to newline */
+	flush();
+	s += comment_start_width;
+	/* Skip characters up to newline */
+	while (*s && !(*s == '\n' || *s == '\r'))
+		s++;
+	/* Skip newline character and its complement (if present) */
+	if (s[0]) {
+		if (s[1] == '\n'+'\r'-s[0])
+			s++;
+		s++;
+	}
+	/* Move p too, so the skipped text is not appended by the next flush. */
+	p = s;
+	goto begin;
+	
+c_comment: /* Remove all characters up to star-slash */
+	flush();
+	s += comment_start_width;
+	/* Skip characters up to and including star-slash */
+	while (*s && !(s[0] == '*' && s[1] == '/'))
+		s++;
+	if (*s)
+		s += 2;
+	else
+		goto failed; /* No star-slash present */
+	/* Move p too, so the skipped text is not appended by the next flush. */
+	p = s;
+	goto begin;
+
+failed:
+	string_free(ret);
+	return NULL;
+	#undef flush
+}
+
+
+/****************************** Unicode ******************************/
+
+static const bool utf8_allow_surrogates = false;
+
+/*
+ * Decode the UTF-8 sequence at *sp into *uc and advance *sp past it.  The
+ * input is trusted: the length comes from the lead byte alone and trailing
+ * bytes are consumed without being checked.
+ */
+static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
+{
+	static const unsigned char lead_mask[4] = {0xFF, 0x1F, 0x0F, 0x07};
+	const unsigned char *in = (const unsigned char *)*sp;
+	const unsigned char lead = *in++;
+	unsigned int trail = 3;	/* number of bytes after the lead byte */
+	unsigned int value;
+	if (lead < 0x80)
+		trail = 0;
+	else if (lead < 0xE0)
+		trail = 1;
+	else if (lead < 0xF0)
+		trail = 2;
+	for (value = lead & lead_mask[trail]; trail > 0; trail--)
+		value = (value << 6) | (*in++ & 0x3F);
+
+	*uc = value;
+	*sp = (const char *)in;
+}
+
+/*
+ * Check that the @length bytes at @str form well-formed UTF-8.
+ *
+ * Besides malformed and truncated sequences this rejects overlong
+ * encodings, code points above U+10FFFF, the non-characters U+FFFE and
+ * U+FFFF, and (unless utf8_allow_surrogates is set) the surrogate range
+ * U+D800..U+DFFF.  Bytes below 0x80, NUL included, are always accepted.
+ */
+static bool utf8_validate(const char *str, size_t length)
+{
+	const unsigned char *p = (const unsigned char*)str;
+	const unsigned char *const end = p + length;
+
+	while (p < end) {
+		const unsigned char lead = *p++;
+		unsigned int trail;	/* continuation bytes required after lead */
+		unsigned int i;
+
+		if (lead < 0x80)
+			continue;	/* ASCII */
+
+		/* From here on p[0] is examined, so it has to exist. */
+		if (p >= end)
+			return false;
+
+		if (lead < 0xE0) {
+			/* 2-byte sequence, U+0080 to U+07FF.  A lead below 0xC2 is
+			   a stray continuation byte or an overlong encoding. */
+			if (lead < 0xC2)
+				return false;
+			trail = 1;
+		} else if (lead < 0xF0) {
+			/* 3-byte sequence, U+0800 to U+FFFF; p[0] narrows the range
+			   further for the leads 0xE0, 0xED and 0xEF. */
+			if (lead == 0xE0 && p[0] < 0xA0)
+				return false;	/* overlong */
+			if (!utf8_allow_surrogates && lead == 0xED && p[0] >= 0xA0)
+				return false;	/* U+D800 to U+DFFF */
+			if (lead == 0xEF && p[0] == 0xBF && (p+1 >= end || p[1] >= 0xBE))
+				return false;	/* U+FFFE, U+FFFF, or cut short */
+			trail = 2;
+		} else {
+			/* 4-byte sequence, U+010000 to U+10FFFF; only the leads
+			   0xF0 to 0xF4 can encode that range. */
+			if (lead > 0xF4)
+				return false;	/* beyond U+10FFFF */
+			if (lead == 0xF0 && p[0] < 0x90)
+				return false;	/* overlong */
+			if (lead == 0xF4 && p[0] >= 0x90)
+				return false;	/* beyond U+10FFFF */
+			trail = 3;
+		}
+
+		/* p sits on the first continuation byte; all of them must be
+		   inside the buffer. */
+		if ((size_t)(end - p) < trail)
+			return false;
+
+		/* Each continuation byte must have the form 10xxxxxx. */
+		for (i = 0; i < trail; i++) {
+			if ((p[i] & 0xC0) != 0x80)
+				return false;
+		}
+		p += trail;
+	}
+
+	return true;
+}
+
+/*
+ * Encode the code point uc as UTF-8 into out[0..3].  The caller's pointer is
+ * not advanced and no terminator is written.
+ *
+ * A value above U+10FFFF produces no output.  Surrogates (U+D800..U+DFFF)
+ * and the non-characters U+FFFE/U+FFFF are encoded anyway, although the
+ * result is technically invalid UTF-8.  Returns the number of bytes written.
+ */
+static int utf8_encode_char(char *out, unsigned int uc)
+{
+	if (uc < 0x80) {
+		out[0] = uc & 0x7F;
+		return 1;
+	}
+	if (uc < 0x800) {
+		out[0] = 0xC0 | (uc >> 6);
+		out[1] = 0x80 | (uc & 0x3F);
+		return 2;
+	}
+	if (uc < 0x10000) {
+		out[0] = 0xE0 | (uc >> 12);
+		out[1] = 0x80 | ((uc >> 6) & 0x3F);
+		out[2] = 0x80 | (uc & 0x3F);
+		return 3;
+	}
+	if (uc < 0x110000) {
+		out[0] = 0xF0 | ((uc >> 18) & 0x07);
+		out[1] = 0x80 | ((uc >> 12) & 0x3F);
+		out[2] = 0x80 | ((uc >> 6) & 0x3F);
+		out[3] = 0x80 | (uc & 0x3F);
+		return 4;
+	}
+	return 0;
+}
+
-- 
cgit v1.2.3