From bb5bbf028efbf663c15628e748a2a878e766ef65 Mon Sep 17 00:00:00 2001 From: Joey Adams Date: Wed, 9 Jun 2010 00:19:29 -0400 Subject: Moved everything in /contrib/json along with json.sgml to root directory. This repository should probably be an honest-to-goodness branch of mainline PostgreSQL. I'm looking into that :-) --- json.c | 1210 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1210 insertions(+) create mode 100644 json.c (limited to 'json.c') diff --git a/json.c b/json.c new file mode 100644 index 0000000..d530edd --- /dev/null +++ b/json.c @@ -0,0 +1,1210 @@ +/* + Copyright (C) 2010 Joseph A. Adams (joeyadams3.14159@gmail.com) + All rights reserved. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include "json.h" + +#include + + + +bool json_escape_unicode = false; + +#define JSON_malloc palloc + +/* repalloc and pfree can't take a null pointer, unlike normal realloc and free. 
*/ +static void *JSON_realloc(void *ptr, Size size) +{ + if (ptr) + return repalloc(ptr, size); + else + return palloc(size); +} +static void JSON_free(void *ptr) +{ + if (ptr) + pfree(ptr); +} + +static char *JSON_strdup(const char *str, size_t length) +{ + char *ret = JSON_malloc(length + 1); + memcpy(ret, str, length); + ret[length] = 0; + return ret; +} + +#define is_internal(node) ((node)->type == JSON_ARRAY || (node)->type == JSON_OBJECT) + +/* We can't use isspace() because it also accepts \v and \f, which + aren't legal whitespace characters in strict JSON. */ +#define is_whitespace(c) ((c)==' ' || (c)=='\t' || (c)=='\n' || (c)=='\r') + +static void skip_whitespace(const char **sp) +{ + const char *s = *sp; + while (is_whitespace(*s)) + s++; + *sp = s; +} + +static char end_parenthesis(json_node *node) +{ + if (!node) + return 0; + switch (node->type) { + case JSON_ARRAY: return ']'; + case JSON_OBJECT: return '}'; + default: return 0; + } +} + +/* + * Reads exactly 4 hex characters (capital or lowercase). + * Writes the result to *out . + * Returns true on success, false on failure. 
/*
 * read_hex16: parse exactly four hexadecimal digits (either case) at `in`.
 *
 * On success the 16-bit value is stored in *out and true is returned.
 * On failure false is returned and *out is left untouched, so a caller
 * never observes a half-accumulated value.  Scanning stops at the first
 * non-hex character, which includes a premature '\0'.
 */
static bool read_hex16(const char *in, unsigned int *out)
{
    unsigned int value = 0;
    int i;

    for (i = 0; i < 4; i++) {
        char c = in[i];
        unsigned int digit;

        if (c >= '0' && c <= '9')
            digit = c - '0';
        else if (c >= 'A' && c <= 'F')
            digit = c - 'A' + 10;
        else if (c >= 'a' && c <= 'f')
            digit = c - 'a' + 10;
        else
            return false;

        value = (value << 4) | digit;
    }

    *out = value;
    return true;
}

/*
 * write_hex16: store the low 16 bits of val at `out` as four uppercase
 * hex digits.  No terminator is written.
 */
static void write_hex16(char *out, unsigned int val)
{
    static const char hex[] = "0123456789ABCDEF";

    out[0] = hex[(val >> 12) & 0xF];
    out[1] = hex[(val >> 8) & 0xF];
    out[2] = hex[(val >> 4) & 0xF];
    out[3] = hex[val & 0xF];
}

static bool utf8_validate(const char *str, size_t length);
static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc);
static int utf8_encode_char(char *out, unsigned int uc);


/*************************** String buffer ***************************/

/*
 * Growable character buffer.  Declared as a one-element array so that a
 * String variable decays to a pointer when passed to a function.
 */
typedef struct {
    char *buffer;   /* storage, NULL until first grown */
    size_t length;  /* bytes in use, not counting the terminator */
    size_t alloc;   /* bytes allocated */
} String[1];

/* Declare and initialize a String with the given name. */
#define String(name) String name = {{NULL, 0, 0}}

/* Grow the string by @need characters, reallocating if necessary.
 * Returns a pointer to the uninitialized range where text is to go.
 * A '\0' terminator is added automatically. */
*/ +static char *string_grow(String str, size_t need) +{ + size_t end = str->length; + str->length += need; + if (str->alloc <= str->length) { + str->alloc = str->length*3/2 + 1; + if (str->alloc < 8) + str->alloc = 8; + str->buffer = JSON_realloc(str->buffer, str->alloc); + } + str->buffer[str->length] = '\0'; + return str->buffer + end; +} +static char *string_buffer(String str) +{ + if (!str->buffer) + string_grow(str, 0); + return str->buffer; +} +static inline void string_append_length(String str, const char *append, size_t len) +{ + char *dest = string_grow(str, len); + memcpy(dest, append, len); +} +static inline void string_append(String str, const char *append) +{ + string_append_length(str, append, strlen(append)); +} +static inline void string_append_char(String str, char c) +{ + *string_grow(str, 1) = c; +} +static inline void string_trunc(String str, size_t len) +{ + str->length = len; + str->buffer[len] = '\0'; +} +static inline void string_free(String str) +{ + JSON_free(str->buffer); +} + + +/*********** json_node creation, manipulation, and deletion **********/ + +json_node *json_mknode(json_type type) +{ + json_node *node = JSON_malloc(sizeof(*node)); + memset(node, 0, sizeof(*node)); + node->type = type; + return node; +} + +json_node *json_mkbool(bool v_bool) +{ + json_node *node = json_mknode(JSON_BOOL); + node->v.v_bool = v_bool; + return node; +} + +json_node *json_mkstring(const char *str, size_t length) +{ + json_node *node = json_mknode(JSON_STRING); + if (str) { + node->v.string.str = JSON_strdup(str, length); + node->v.string.length = length; + } + return node; +} + +json_node *json_mknumber(const char *number, size_t length) +{ + json_node *node = json_mknode(JSON_NUMBER); + if (number) + node->v.number = JSON_strdup(number, length); + return node; +} + +void json_append(json_node *parent, json_node *child) +{ + Assert(parent->type==JSON_ARRAY || parent->type==JSON_OBJECT); + Assert(child->parent == NULL); + + 
parent->v.children.count++; + child->parent = parent; + child->prev = parent->v.children.tail; + child->next = NULL; + + if (parent->v.children.tail) { + parent->v.children.tail->next = child; + parent->v.children.tail = child; + } else { + parent->v.children.head = parent->v.children.tail = child; + } +} + +void json_remove(json_node *node) +{ + json_node *parent = node->parent; + + if (!parent) + return; + Assert(parent->type==JSON_ARRAY || parent->type==JSON_OBJECT); + Assert(parent->v.children.count > 0); + + if (node->prev) + node->prev->next = node->next; + else + parent->v.children.head = node->next; + if (node->next) + node->next->prev = node->prev; + else + parent->v.children.tail = node->prev; + + parent->v.children.count--; + node->parent = NULL; + node->prev = NULL; + node->next = NULL; +} + +const char *json_get_string(json_node *node, size_t *length_out) +{ + Assert(node->type == JSON_STRING); + if (length_out) + *length_out = node->v.string.length; + return node->v.string.str; +} + +void json_set_string(json_node *node, const char *str, size_t length) +{ + Assert(node->type == JSON_STRING); + if (node->v.string.str) + JSON_free(node->v.string.str); + if (str) { + node->v.string.str = JSON_strdup(str, length); + node->v.string.length = length; + } else { + node->v.string.str = NULL; + node->v.string.length = 0; + } +} + +const char *json_get_number(json_node *node) +{ + Assert(node->type == JSON_NUMBER); + return node->v.number; +} + +void json_set_number(json_node *node, const char *number, size_t length) +{ + Assert(node->type == JSON_NUMBER); + if (node->v.number) + JSON_free(node->v.number); + if (number) + node->v.number = JSON_strdup(number, length); + else + node->v.number = NULL; +} + +/* Non-recursively free a node */ +static void free_node(json_node *node) +{ + if (node->type == JSON_STRING) + JSON_free(node->v.string.str); + else if (node->type == JSON_NUMBER) + JSON_free(node->v.number); + if (node->key) + JSON_free(node->key); + 
JSON_free(node); +} + +void json_delete(json_node *node) +{ + json_node *parent, *next; + + if (!node) + return; + + /* Remove node from parent (if it has one). */ + json_remove(node); + + goto descend; + +descend: + while (is_internal(node) && node->v.children.head) + node = node->v.children.head; + goto advance; + +advance: + parent = node->parent; + next = node->next; + free_node(node); + node = next; + + if (node) + goto descend; + else + goto ascend; + +ascend: + node = parent; + if (node) + goto advance; + else + return; +} + + +/*********************** Parsing and validation **********************/ + +static json_node *decode_leaf(const char **sp); +static json_node *decode_number(const char **sp); +static char *decode_string(const char **sp, size_t *length); +/* decode_string has a different signature than its friends + because it's also used to parse object member keys. */ + +bool json_validate(const char *str) +{ + json_node *node = json_decode(str); + if (!node) + return false; + json_delete(node); + return true; +} + +json_node *json_decode(const char *str) +{ + json_node *root = NULL, *parent = NULL, *node = NULL; + const char *s = str; + char *key = NULL; + size_t key_length = 0; + + if (!str) + return NULL; + + if (!utf8_validate(str, strlen(str))) + return NULL; + + goto item; + +item: /* Expect a value */ + skip_whitespace(&s); + + if (parent && parent->type == JSON_OBJECT) { + /* Parse member key string. 
*/ + key = decode_string(&s, &key_length); + if (!key) + goto failed; + + /* Eat the " : " */ + skip_whitespace(&s); + if (*s != ':') + goto failed; + s++; + skip_whitespace(&s); + } else { + key = NULL; + } + + node = decode_leaf(&s); + if (!node) { + if (*s == '[') + node = json_mknode(JSON_ARRAY); + else if (*s == '{') + node = json_mknode(JSON_OBJECT); + else + goto failed; + s++; + } + + if (key) { + node->key = key; + node->key_length = key_length; + key = NULL; + } + + if (parent) + json_append(parent, node); + else + root = node; + + if (is_internal(node)) { + parent = node; + goto item_endp; + } + + if (parent) + goto comma_endp; + else + goto end; + +comma_endp: /* Expect a comma or end bracket/brace */ + skip_whitespace(&s); + + if (*s == ',') { + s++; + goto item; + } + if (*s == ']' || *s == '}') + goto endp; + + goto failed; + +item_endp: /* Expect a value or end bracket/brace */ + skip_whitespace(&s); + if (*s == ']' || *s == '}') + goto endp; + goto item; + +endp: /* Handle an end bracket/brace */ + if (*s != end_parenthesis(parent)) + goto failed; + s++; + node = parent; + parent = parent->parent; + if (parent) + goto comma_endp; + else + goto end; + +end: /* Expect end of text */ + skip_whitespace(&s); + if (*s) + goto failed; + return node; + +failed: /* Handle failure */ + if (key) + JSON_free(key); + json_delete(root); + return NULL; +} + +/* + * Decode and skip a node that does not have children. + * Whitespace is not skipped first (it is done in the primary decode loop). + * + * Returns NULL if next character is '[', '{', or invalid. 
+ */ +static json_node *decode_leaf(const char **sp) +{ + char c = **sp; + + if (c == '"') { + size_t length; + char *str = decode_string(sp, &length); + + if (str) { + json_node *node = json_mknode(JSON_STRING); + node->v.string.str = str; + node->v.string.length = length; + return node; + } + + return NULL; + } + if ((c >= '0' && c <= '9') || c == '-') + return decode_number(sp); + if (!strncmp(*sp, "true", 4)) { + (*sp) += 4; + return json_mkbool(true); + } + if (!strncmp(*sp, "false", 5)) { + (*sp) += 5; + return json_mkbool(false); + } + if (!strncmp(*sp, "null", 4)) { + (*sp) += 4; + return json_mknode(JSON_NULL); + } + + return NULL; +} + +/* + * The JSON spec says that a number shall follow this precise pattern + * (spaces and quotes added for readability): + * '-'? (0 | [1-9][0-9]*) ('.' [0-9]+)? ([Ee] [+-]? [0-9]+)? + * + * However, some JSON parsers are more liberal. For instance, PHP accepts + * '.5' and '1.'. JSON.parse accepts '+3'. + * + * This function takes the strict approach. The user should use + * json_clean() to handle liberal JSON text. + */ +static bool validate_number(const char **sp) +{ + const char *s = *sp; + + /* '-'? */ + if (*s == '-') + s++; + + /* (0 | [1-9][0-9]*) */ + if (*s == '0') { + s++; + } else { + if (!isdigit(*s)) + return false; + do s++; while (isdigit(*s)); + } + + /* ('.' [0-9]+)? */ + if (*s == '.') { + s++; + if (!isdigit(*s)) + return false; + do s++; while (isdigit(*s)); + } + + /* ([Ee] [+-]? [0-9]+)? 
*/ + if (*s=='E' || *s=='e') { + s++; + if (*s=='+' || *s=='-') + s++; + if (!isdigit(*s)) + return false; + do s++; while (isdigit(*s)); + } + + *sp = s; + return true; +} + +static json_node *decode_number(const char **sp) +{ + const char *start, *end; + + start = *sp; + if (!validate_number(sp)) + return NULL; + end = *sp; + + return json_mknumber(start, end - start); +} + +static char *decode_string(const char **sp, size_t *length) +{ + const char *s = *sp; + String(ret); + char *out; + size_t size; + + if (*s++ != '"') + return NULL; + + while (*s && *s != '"') { + unsigned char c = *s++; + unsigned int uc, lc; + + if (c == '\\') { + c = *s++; + switch (c) { + case '"': + case '\\': + case '/': + break; + case 'b': c = '\b'; break; + case 'f': c = '\f'; break; + case 'n': c = '\n'; break; + case 'r': c = '\r'; break; + case 't': c = '\t'; break; + case 'u': + size = ret->length; + out = string_grow(ret, 4); + + if (!read_hex16(s, &uc)) + goto failed; + s += 4; + + if (uc >= 0xD800 && uc <= 0xDFFF) { + /* Handle UTF-16 surrogate pair. */ + + if (uc >= 0xDC00) + goto failed; /* Second surrogate not preceded by + first surrogate. */ + + if (s[0] != '\\' || s[1] != 'u' + || !read_hex16(s+2, &lc) + || !(lc >= 0xDC00 && lc <= 0xDFFF)) + goto failed; /* First surrogate not followed by + second surrogate. */ + + s += 6; + + uc = 0x10000 | ((uc & 0x3FF) << 10) | (lc & 0x3FF); + } + + /* 0xFFFE and 0xFFFF are invalid Unicode */ + if (uc == 0xFFFE || uc == 0xFFFF) + goto failed; + + size += utf8_encode_char(out, uc); + string_trunc(ret, size); + + continue; /* Continue the enclosing while loop to skip + the str_append below. */ + default: /* Invalid escape */ + goto failed; + } + } else if (c <= 0x1F) { + /* Control characters not allowed in string literals. 
*/ + goto failed; + } + string_append_char(ret, c); + } + + if (!*s++) + goto failed; + + *length = ret->length; + *sp = s; + return string_buffer(ret); + +failed: + string_free(ret); + return NULL; +} + +json_type json_text_type(const char *str, size_t nbytes) +{ + const char *s = str; + const char *e = str + nbytes; + char c; + + /* Skip whitespace characters. */ + while (s < e && is_whitespace(*s)) + s++; + + /* Get first non-white character, making sure it's in bounds. */ + if (s >= e) + return JSON_INVALID; + c = *s; + + switch (c) { + case 'n': + return JSON_NULL; + case '"': + return JSON_STRING; + case 't': + case 'f': + return JSON_BOOL; + case '{': + return JSON_OBJECT; + case '[': + return JSON_ARRAY; + default: + if (c == '-' || (c >= '0' && c <= '9')) + return JSON_NUMBER; + return JSON_INVALID; + } +} + + +/****************************** Encoding *****************************/ + +static bool encode_string(String out, const char *string, size_t length) +{ + const char *s = string; + const char *e = s + length; + + if (!utf8_validate(string, length)) + return false; + + string_append_char(out, '"'); + + while (s < e) { + unsigned char c = *s++; + unsigned char e; + + switch (c) { + case '"': e = '"'; break; + case '\\': e = '\\'; break; + case '\b': e = 'b'; break; + case '\f': e = 'f'; break; + case '\n': e = 'n'; break; + case '\r': e = 'r'; break; + case '\t': e = 't'; break; + default: { + if (c < 0x1F || (c >= 0x80 && json_escape_unicode)) { + /* Encode using \u.... */ + unsigned int uc, lc; + char txt[13]; + + s--; + utf8_decode_char_nocheck(&s, &uc); + + txt[0] = '\\'; + txt[1] = 'u'; + txt[6] = '\\'; + txt[7] = 'u'; + if (uc <= 0xFFFF) { + write_hex16(txt+2, uc); + txt[6] = '\0'; + } else { + uc -= 0x10000; + lc = uc & 0x3FF; + uc = uc >> 10; + uc |= 0xD800; + lc |= 0xDC00; + write_hex16(txt+2, uc); + write_hex16(txt+8, lc); + txt[12] = '\0'; + } + + string_append(out, txt); + continue; /* Skip backslash-encoding code below. 
*/ + } + e = 0; + } + } + + string_append_char(out, e ? '\\' : c); + if (e) + string_append_char(out, e); + } + + string_append_char(out, '"'); + + return true; +} + +static bool encode_number(String out, const char *string) +{ + const char *s = string; + const char *start, *end; + + if (!string) + return false; + + /* Validate number, trimming whitespace. */ + skip_whitespace(&s); + start = s; + if (!validate_number(&s)) + return false; + end = s; + skip_whitespace(&s); + if (*s != '\0') + return false; + + /* Append number to out */ + string_append_length(out, start, end-start); + + return true; +} + +char *json_encode(json_node *node) +{ + String(ret); + const char *txt; + json_node *sentinel; + + if (!node) + return NULL; + sentinel = node->parent; + + goto begin_nokey; + +begin: /* Encode entire node, or (if it's an array or object) + the beginning of it. */ + + if (node->key) { + if (!encode_string(ret, node->key, node->key_length)) + goto failed; + string_append_char(ret, ':'); + } + goto begin_nokey; + +begin_nokey: + + txt = NULL; + switch (node->type) { + case JSON_NULL: + txt = "null"; + break; + case JSON_BOOL: + if (node->v.v_bool) + txt = "true"; + else + txt = "false"; + break; + case JSON_STRING: + if (!encode_string(ret, node->v.string.str, node->v.string.length)) + goto failed; + break; + case JSON_NUMBER: + if (!encode_number(ret, node->v.number)) + goto failed; + break; + case JSON_ARRAY: + txt = "["; + break; + case JSON_OBJECT: + txt = "{"; + break; + default: + goto failed; + } + if (txt) + string_append(ret, txt); + + if (is_internal(node) && node->v.children.head) { + node = node->v.children.head; + goto begin; + } else { + goto finish; + } + +finish: /* Finish a node and move to the next one. 
*/ + if (node->type == JSON_ARRAY) + string_append_char(ret, ']'); + else if (node->type == JSON_OBJECT) + string_append_char(ret, '}'); + + if (node->next) { + string_append_char(ret, ','); + node = node->next; + goto begin; + } + if (node->parent != sentinel) { + node = node->parent; + goto finish; + } + goto end; + +end: /* All nodes finished being serialized. */ + return string_buffer(ret); + +failed: /* Handle error. */ + string_free(ret); + return NULL; +} + + +/************************ Liberal JSON support ***********************/ + +bool json_validate_liberal(const char *str) +{ + json_node *node = json_decode_liberal(str); + if (!node) + return false; + json_delete(node); + return true; +} + +json_node *json_decode_liberal(const char *str) +{ + char *cleaned = json_cleanup(str); + json_node *node = json_decode(cleaned); + if (cleaned) + JSON_free(cleaned); + return node; +} + +char *json_cleanup(const char *str) +{ + String(ret); + const char *p = str; + const char *s = str; + int comment_start_width = 0; + char quote_char = 0; + /* flush(): flush content we have scanned, meaning append characters + * from p thru s to ret, then set p to s. 
*/ + #define flush() do { \ + string_append_length(ret, p, s-p); \ + p = s; \ + } while(0) + + if (!str) + return NULL; + + goto begin; + +begin: + for (;*s; s++) { + if (*s == '"' || *s == '\'') + goto quote; + if (isdigit(*s) || *s=='-' || *s=='+' || *s=='.') + goto number; + if (s[0]=='#') { + comment_start_width = 1; + goto line_comment; + } + if (s[0]=='/' && s[1]=='/') { + comment_start_width = 2; + goto line_comment; + } + if (s[0]=='/' && s[1]=='*') { + comment_start_width = 2; + goto c_comment; + } + } + flush(); + return ret->buffer; + +quote: + quote_char = *s; + if (*s == '\'') { + flush(); + string_append_char(ret, '"'); + p = s = s+1; + } else { + s++; + } + while (*s) { + if (*s == quote_char) { + if (*s == '\'') { + flush(); + string_append_char(ret, '"'); + p = s = s+1; + } else { + s++; + } + break; + } if (*s == '"') { + /* We're converting single quotes to double quotes, + * so double quotes need to be automatically escaped. */ + flush(); + string_append_char(ret, '\\'); + s++; + } else if (*s == '\\') { + s++; + switch (*s) { + case '\0': + break; + case '\'': + /* Convert \' to \u0027 */ + flush(); + string_append(ret, "u0027"); + p = s = s+1; + break; + default: + s++; + } + } else { + s++; + } + } + goto begin; + +number: + /* Skip a '-', or remove a '+' if present. */ + if (*s == '-') { + s++; + } else if (*s == '+') { + flush(); + p = s = s+1; + } + /* Make sure number has at least one digit. */ + if (!isdigit(*s)) { + if (*s != '.') + goto failed; + if (!isdigit(s[1])) + goto failed; + } + /* Make sure that if first digit before '.' is '0', that it is the only digit. + * Leading 0s are not allowed, and for a good reason: to avoid ambiguity + * between octal and decimal formats. */ + if (*s == '0') { + s++; + if (isdigit(*s)) + goto failed; + goto frac; + } + /* Skip digits, or add a '0' if none are present. 
*/ + if (isdigit(*s)) { + do s++; while (isdigit(*s)); + } else { + flush(); + string_append_char(ret, '0'); + } + goto frac; + +frac: + if (*s == '.') { + s++; + if (isdigit(*s)) { + do s++; while (isdigit(*s)); + } else { + flush(); + string_append_char(ret, '0'); + } + } +/* exp: */ + if (*s=='E' || *s=='e') { + s++; + if (*s=='+' || *s=='-') + s++; + if (!isdigit(*s)) + goto failed; + do s++; while (isdigit(*s)); + } + /* The isdigit check is not needed, but here + * for clarity and safety. */ + if (isdigit(*s) || *s=='-' || *s=='+' || *s=='.') + goto failed; + goto begin; + +line_comment: /* Remove all characters up to newline */ + flush(); + s += comment_start_width; + /* Skip characters up to newline */ + while (*s && !(*s == '\n' || *s == '\r')) + s++; + /* Skip newline character and its complement (if present) */ + if (s[0]) { + if (s[1] == '\n'+'\r'-s[0]) + s++; + s++; + } + /* Set begin marker so characters skipped are not + * appended to output on next flush. */ + p = s; + goto begin; + +c_comment: /* Remove all characters up to star-slash */ + flush(); + s += comment_start_width; + /* Skip characters up to and including star-slash */ + while (*s && !(s[0] == '*' && s[1] == '/')) + s++; + if (*s) + s += 2; + else + goto failed; /* No star-slash present */ + /* Set begin marker so characters skipped are not + * appended to output on next flush. 
*/ + p = s; + goto begin; + +failed: + string_free(ret); + return NULL; + + #undef flush +} + + +/****************************** Unicode ******************************/ + +static const bool utf8_allow_surrogates = false; + +static void utf8_decode_char_nocheck(const char **sp, unsigned int *uc) +{ + const unsigned char *s = (const unsigned char *)*sp; + unsigned char c = *s++; + unsigned int len; + unsigned char sf[4] = {0xFF, 0x1F, 0xF, 0x7}; + + if (c < 0x80) + len = 0; + else if (c < 0xE0) + len = 1; + else if (c < 0xF0) + len = 2; + else + len = 3; + + *uc = c & sf[len]; + while (len--) { + *uc <<= 6; + *uc |= *s++ & 0x3F; + } + + *sp = (const char*)s; +} + +static bool utf8_validate(const char *str, size_t length) +{ + const unsigned char *s = (const unsigned char*)str; + const unsigned char *e = s + length; + + while (s < e) { + unsigned char c = *s++; + unsigned int len; /* number of bytes in sequence - 2 */ + + /* If character is ASCII, move on. */ + if (c < 0x80) + continue; + + if (s >= e) + return false; /* Missing bytes in sequence. */ + + if (c < 0xE0) { + /* 2-byte sequence, U+0080 to U+07FF + c must be 11000010 or higher + s[0] must be 10xxxxxx */ + len = 0; + if (c < 0xC2) + return false; + } else if (c < 0xF0) { + /* 3-byte sequence, U+0800 to U+FFFF + Note that the surrogate range is U+D800 to U+DFFF, + and that U+FFFE and U+FFFF are illegal characters. + c must be >= 11100000 (which it is) + If c is 11100000, then s[0] must be >= 10100000 + If the global parameter utf8_allow_surrogates is false: + If c is 11101101 and s[0] is >= 10100000, + then this is a surrogate and we should fail. + If c is 11101111, s[0] is 10111111, and s[1] >= 10111110, + then this is an illegal character and we should fail. 
+ s[0] and s[1] must be 10xxxxxx */ + len = 1; + if (c == 0xE0 && *s < 0xA0) + return false; + if (!utf8_allow_surrogates && c == 0xED && *s >= 0xA0) + return false; + if (c == 0xEF && s[0] == 0xBF && (s+1 >= e || s[1] >= 0xBE)) + return false; + } else { + /* 4-byte sequence, U+010000 to U+10FFFF + c must be >= 11110000 (which it is) and <= 11110100 + If c is 11110000, then s[0] must be >= 10010000 + If c is 11110100, then s[0] must be < 10010000 + s[0], s[1], and s[2] must be 10xxxxxx */ + len = 2; + if (c > 0xF4) + return false; + if (c == 0xF0 && *s < 0x90) + return false; + if (c == 0xF4 && *s >= 0x90) + return false; + } + + if (s + len >= e) + return false; /* Missing bytes in sequence. */ + + do { + if ((*s++ & 0xC0) != 0x80) + return false; + } while (len--); + } + + return true; +} + +/* + * Encodes the Unicode character uc as UTF-8, writing it + * to *out and updating *out to point to the end of the UTF-8 sequence. + * + * If uc is too high, no character will be emitted, and *out will + * not be changed. If uc is in the UTF-16 surrogate range + * (U+D800 thru U+DFFF) or is a designated not-a-character + * (U+FFFE or U+FFFF), the character will be emitted anyway, + * although it is technically invalid UTF-8. + * + * Returns the number of characters emitted. + */ +static int utf8_encode_char(char *out, unsigned int uc) +{ + char *start = out; + + if (uc < 0x80) { + *out++ = uc & 0x7F; + } else if (uc < 0x800) { + *out++ = 0xC0 | (uc >> 6); + *out++ = 0x80 | (uc & 0x3F); + } else if (uc < 0x10000) { + *out++ = 0xE0 | (uc >> 12); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + } else if (uc < 0x110000) { + *out++ = 0xF0 | ((uc >> 18) & 0x07); + *out++ = 0x80 | ((uc >> 12) & 0x3F); + *out++ = 0x80 | ((uc >> 6) & 0x3F); + *out++ = 0x80 | (uc & 0x3F); + } + + return out - start; +} + -- cgit v1.2.3