diff options
| author | Joey Adams | 2010-06-14 06:29:58 +0000 |
|---|---|---|
| committer | Joey Adams | 2010-06-14 06:29:58 +0000 |
| commit | e8f4dc86cb87b38ee19b8574dadbbe6629f35ef0 (patch) | |
| tree | d6b73069951e7decfcd9d22a94d57686694426b1 | |
| parent | 042358fb35a690afdb97c12800b34af9515d15d9 (diff) | |
Implemented/tested parser for basic JSONPath(ish) patterns.
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | expected/json.out | 300 | ||||
| -rw-r--r-- | json.c | 70 | ||||
| -rw-r--r-- | json.h | 27 | ||||
| -rw-r--r-- | json.sql.in | 5 | ||||
| -rw-r--r-- | json_op.c | 17 | ||||
| -rw-r--r-- | jsonpath.c | 246 | ||||
| -rw-r--r-- | jsonpath.h | 37 | ||||
| -rw-r--r-- | sql/json.sql | 53 |
9 files changed, 735 insertions, 22 deletions
@@ -1,5 +1,5 @@ MODULE_big = json -OBJS = json.o json_io.o json_op.o +OBJS = json.o jsonpath.o json_io.o json_op.o DATA_built = json.sql DATA = uninstall_json.sql diff --git a/expected/json.out b/expected/json.out index ecf4105..e8264bd 100644 --- a/expected/json.out +++ b/expected/json.out @@ -599,3 +599,303 @@ SELECT from_json('invalid'); ERROR: invalid JSON content LINE 1: SELECT from_json('invalid'); ^ +SELECT parse_json_path('..'); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path('..*'); + parse_json_path +----------------- + $..[*] +(1 row) + +SELECT parse_json_path('.*'); + parse_json_path +----------------- + $[*] +(1 row) + +SELECT parse_json_path(''); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path('$.*'); + parse_json_path +----------------- + $[*] +(1 row) + +SELECT parse_json_path('$'); + parse_json_path +----------------- + $ +(1 row) + +SELECT parse_json_path('$*'); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path('*'); + parse_json_path +----------------- + $[*] +(1 row) + +SELECT parse_json_path($$ .. [ 025 ] $$); + parse_json_path +----------------- + $..[25] +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 $$); + parse_json_path +----------------- + $[0][3] +(1 row) + +SELECT parse_json_path($$ $ . 0_3 $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ $ . 0.3 $$); + parse_json_path +----------------- + $[0][3] +(1 row) + +SELECT parse_json_path($$ .. [ 031 ] $$); + parse_json_path +----------------- + $..[31] +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 . hello $$); + parse_json_path +------------------ + $[0][3]["hello"] +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 . hello . "back\\slash" $$); + parse_json_path +--------------------------------- + $[0][3]["hello"]["back\\slash"] +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 . hello . ["back\\slash"] $$); + parse_json_path +--------------------------------- + $[0][3]["hello"]["back\\slash"] +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 . hello . back\slash $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ $ . 0 . 3 . hello . backslash $$); + parse_json_path +------------------------------- + $[0][3]["hello"]["backslash"] +(1 row) + +SELECT parse_json_path($$ .. [ 0x31 ] $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ [ 0x31 ] $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ _3 $$); + parse_json_path +----------------- + $["_3"] +(1 row) + +SELECT parse_json_path($$ _3_ $$); + parse_json_path +----------------- + $["_3_"] +(1 row) + +SELECT parse_json_path($$ [ _3 ] $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ $ _3 $$); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$ $ . _3 $$); + parse_json_path +----------------- + $["_3"] +(1 row) + +SELECT parse_json_path('..["5"]'); + parse_json_path +----------------- + $..["5"] +(1 row) + +SELECT parse_json_path('..[5]'); + parse_json_path +----------------- + $..[5] +(1 row) + +SELECT parse_json_path('..5'); + parse_json_path +----------------- + $..[5] +(1 row) + +SELECT parse_json_path($$ .. [ -5 ] $$); + parse_json_path +----------------- + $..[-5] +(1 row) + +SELECT parse_json_path($$ .. [ "5" ] $$); + parse_json_path +----------------- + $..["5"] +(1 row) + +SELECT parse_json_path($$ .. [ +5 ] $$); + parse_json_path +----------------- + $..[5] +(1 row) + +SELECT parse_json_path($$ .. [ 5 ] $$); + parse_json_path +----------------- + $..[5] +(1 row) + +SELECT parse_json_path($$ .. ["5"] $$); + parse_json_path +----------------- + $..["5"] +(1 row) + +SELECT parse_json_path($$ ..["5"] $$); + parse_json_path +----------------- + $..["5"] +(1 row) + +SELECT parse_json_path($$ [ "5" ] $$); + parse_json_path +----------------- + $["5"] +(1 row) + +SELECT parse_json_path($$ [ 5 ] $$); + parse_json_path +----------------- + $[5] +(1 row) + +SELECT parse_json_path('."hello"'); + parse_json_path +----------------- + $["hello"] +(1 row) + +SELECT parse_json_path('.hello'); + parse_json_path +----------------- + $["hello"] +(1 row) + +SELECT parse_json_path('...["hello world"]'); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path('..["hello world"]'); + parse_json_path +-------------------- + $..["hello world"] +(1 row) + +SELECT parse_json_path('."hello world"'); + parse_json_path +------------------ + $["hello world"] +(1 row) + +SELECT parse_json_path('.["hello world"]'); + parse_json_path +------------------ + $["hello world"] +(1 row) + +SELECT parse_json_path('.hello world'); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path('..["hello world"]["5"]'); + parse_json_path +------------------------- + $..["hello world"]["5"] +(1 row) + +SELECT parse_json_path('..["hello world"][5]'); + parse_json_path +----------------------- + $..["hello world"][5] +(1 row) + +SELECT parse_json_path('..["hello world"][5]..[3]..*.[*]'); + parse_json_path +------------------------------------ + $..["hello world"][5]..[3]..[*][*] +(1 row) + +SELECT parse_json_path('..["hello world"][5]..[3]..*'); + parse_json_path +--------------------------------- + $..["hello world"][5]..[3]..[*] +(1 row) + +SELECT parse_json_path('..["hello world"][5]..[3]'); + parse_json_path +---------------------------- + $..["hello world"][5]..[3] +(1 row) + +SELECT parse_json_path('..["hello world"][5]..[3]*'); + parse_json_path +----------------- + +(1 row) + +SELECT parse_json_path($$..['"hello"\\"world"'][5]..[3]*$$); + parse_json_path +----------------- + +(1 row) + @@ -349,9 +349,10 @@ ascend: static json_node *decode_leaf(const char **sp); static json_node *decode_number(const char **sp); -static char *decode_string(const char **sp, size_t *length); -/* decode_string has a different signature than its friends - because it's also used to parse object member keys. */ +char *json_decode_string(const char **sp, size_t *length, bool strict); +/* json_decode_string has a different signature than its friends + because it's also used to parse object member keys. + It's also useful outside of json.c, such as in jsonpath.c . */ bool json_validate(const char *str) { @@ -382,7 +383,7 @@ item: /* Expect a value */ if (parent && parent->type == JSON_OBJECT) { /* Parse member key string. */ - key = decode_string(&s, &key_length); + key = json_decode_string(&s, &key_length, true); if (!key) goto failed; @@ -482,7 +483,7 @@ static json_node *decode_leaf(const char **sp) if (c == '"') { size_t length; - char *str = decode_string(sp, &length); + char *str = json_decode_string(sp, &length, true); if (str) { json_node *node = json_mknode(JSON_STRING); @@ -573,24 +574,32 @@ static json_node *decode_number(const char **sp) return json_mknumber(start, end - start); } -static char *decode_string(const char **sp, size_t *length) +char *json_decode_string(const char **sp, size_t *length, bool strict) { const char *s = *sp; String(ret); char *out; size_t size; + char quote; + + Assert(length != NULL); - if (*s++ != '"') - return NULL; - - while (*s && *s != '"') { + quote = *s++; + if (strict) { + if (quote != '"') + return NULL; + } else { + if (quote != '"' && quote != '\'') + return NULL; + } + + while (*s && *s != quote) { unsigned char c = *s++; unsigned int uc, lc; if (c == '\\') { c = *s++; switch (c) { - case '"': case '\\': case '/': break; @@ -635,7 +644,11 @@ static char *decode_string(const char **sp, size_t *length) continue; /* Continue the enclosing while loop to skip the str_append below. */ default: /* Invalid escape */ - goto failed; + if (c == quote) + break; + if (!strict && (c == '"' || c == '\'')) + break; + goto failed; /* Invalid escape */ } } else if (c <= 0x1F) { /* Control characters not allowed in string literals. */ @@ -693,22 +706,21 @@ json_type json_text_type(const char *str, size_t nbytes) /****************************** Encoding *****************************/ -static bool encode_string(String out, const char *string, size_t length) +static bool encode_string(String out, const char *string, size_t length, char quote) { const char *s = string; const char *e = s + length; - - if (!utf8_validate(string, length)) + + if (!utf8_validate(string, length) || quote == '\\') return false; - - string_append_char(out, '"'); + + string_append_char(out, quote); while (s < e) { unsigned char c = *s++; unsigned char e; switch (c) { - case '"': e = '"'; break; case '\\': e = '\\'; break; case '\b': e = 'b'; break; case '\f': e = 'f'; break; @@ -716,6 +728,10 @@ static bool encode_string(String out, const char *string, size_t length) case '\r': e = 'r'; break; case '\t': e = 't'; break; default: { + if (c == quote) { + e = quote; + break; + } if (c < 0x1F || (c >= 0x80 && json_escape_unicode)) { /* Encode using \u.... */ unsigned int uc, lc; @@ -754,7 +770,7 @@ static bool encode_string(String out, const char *string, size_t length) string_append_char(out, e); } - string_append_char(out, '"'); + string_append_char(out, quote); return true; } @@ -799,7 +815,7 @@ begin: /* Encode entire node, or (if it's an array or object) the beginning of it. */ if (node->key) { - if (!encode_string(ret, node->key, node->key_length)) + if (!encode_string(ret, node->key, node->key_length, '"')) goto failed; string_append_char(ret, ':'); } @@ -819,7 +835,7 @@ begin_nokey: txt = "false"; break; case JSON_STRING: - if (!encode_string(ret, node->v.string.str, node->v.string.length)) + if (!encode_string(ret, node->v.string.str, node->v.string.length, '"')) goto failed; break; case JSON_NUMBER: @@ -870,6 +886,18 @@ failed: /* Handle error. */ return NULL; } +char *json_encode_string(const char *str, size_t length, char quote) +{ + String(ret); + + if (!encode_string(ret, str, length, quote)) { + string_free(ret); + return NULL; + } + + return string_buffer(ret); +} + /************************ Liberal JSON support ***********************/ @@ -119,6 +119,33 @@ char *json_cleanup(const char *str); void json_delete(json_node *node); /* + * Decodes a JSON-encoded string literal + * (If you're interested in the decoding JSON in general, see json_decode). + * If strict is true, string must be double-quoted, + * as is required by the JSON RFC. + * Otherwise, the string may be single- or double-quoted. + * Also, no whitespace skipping is done, so the caller should only + * call this function when it expects **sp to be either " or ' + * + * On success, returns the decoded string and passes that string's length + * through *length (which must not be NULL). On failure (parse error), + * returns NULL and leaves *length untouched. + */ +char *json_decode_string(const char **sp, size_t *length, bool strict); + +/* + * Encodes a string literal JSON-style using the given quote character, + * only escaping characters when necessary + * (If you're interested in encoding JSON in general, see json_encode). + * Note that using anything but '"' as the quote character will result in + * invalid JSON. + * + * Returns NULL if input is invalid UTF-8 or if an invalid quote character + * (such as backslash) is given. + */ +char *json_encode_string(const char *str, size_t length, char quote); + +/* * Default: false * * If json_escape_unicode is set, json_encode will escape all Unicode diff --git a/json.sql.in b/json.sql.in index 5e4dcf3..5a3bcca 100644 --- a/json.sql.in +++ b/json.sql.in @@ -53,3 +53,8 @@ CREATE OR REPLACE FUNCTION to_json(anyelement) RETURNS json AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE; -- not STRICT; allows to_json(NULL) to return 'null'::json. + +CREATE OR REPLACE FUNCTION parse_json_path(text) +RETURNS text +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT IMMUTABLE; @@ -1,4 +1,5 @@ #include "json.h" +#include "jsonpath.h" #define PG_RETURN_ENUM(typname, label) return enumLabelToOid(typname, label) @@ -52,3 +53,19 @@ json_get_type(PG_FUNCTION_ARGS) PG_RETURN_ENUM("json_type_t", enum_type_names[type]); } + +PG_FUNCTION_INFO_V1(parse_json_path); +Datum parse_json_path(PG_FUNCTION_ARGS); +Datum +parse_json_path(PG_FUNCTION_ARGS) +{ + char *string = text_to_cstring(PG_GETARG_TEXT_PP(0)); + JSONPath *jpath = jp_parse(string); + char *normalized; + + if (!jpath) + PG_RETURN_NULL(); + + normalized = jp_show(jpath); + PG_RETURN_TEXT_P(cstring_to_text(normalized)); +} diff --git a/jsonpath.c b/jsonpath.c new file mode 100644 index 0000000..8de8327 --- /dev/null +++ b/jsonpath.c @@ -0,0 +1,246 @@ +#include "jsonpath.h" + +#include <ctype.h> + +/* NB: These macros evaluate their argument multiple times. */ + +#define isletter(c) (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z')) + /* isalpha() is locale-specific. This simply matches [A-Za-z] . */ +#define isextended(c) ((unsigned char)(c) > 127) + +/* Note that Unicode characters are allowed in identifiers. */ +#define identifier_start(c) (isletter(c) || (c) == '_' || (c) == '$' || isextended(c)) +#define identifier_char(c) (identifier_start(c) || isdigit(c)) + +#define integer_start(c) (isdigit(c) || (c) == '+' || (c) == '-') + +/* + * Note that skip_spaces differs from skip_whitespace in json.c + * in that this function treats '\f' and '\v' as whitespace. + * This is because JSON does not accept these characters as + * whitespace, but since this is JSONPath, + * we can do whatever we want here :-) + */ +static void skip_spaces(const char **sp) +{ + const char *s = *sp; + while (isspace(*s)) + s++; + *sp = s; +} + +static jp_element *mkElement(jp_element_type type, bool rd) +{ + jp_element *elem = palloc0(sizeof(*elem)); + elem->type = type; + elem->recursive_descent = rd; + return elem; +} + +static jp_element *mkRoot(void) +{ + jp_element *elem = mkElement(JP_ROOT, false); + return elem; +} + +static jp_element *mkWildcard(bool rd) +{ + jp_element *elem = mkElement(JP_WILDCARD, rd); + return elem; +} + +static jp_element *mkIndexSubscript(int index, bool rd) +{ + jp_element *elem = mkElement(JP_INDEX_SUBSCRIPT, rd); + elem->data.index = index; + return elem; +} + +static jp_element *mkKeySubscript(char *key, size_t length, bool rd) +{ + jp_element *elem = mkElement(JP_KEY_SUBSCRIPT, rd); + elem->data.key.ptr = key; + elem->data.key.length = length; + return elem; +} + +char *jp_show(JSONPath *jp) +{ + StringInfoData string[1]; + ListCell *cell; + jp_element *elem; + bool rd; + char *tmp; + + initStringInfo(string); + + foreach(cell, jp) { + elem = lfirst(cell); + rd = elem->recursive_descent; + + switch (elem->type) { + case JP_ROOT: + appendStringInfoChar(string, '$'); + break; + case JP_WILDCARD: + appendStringInfoString(string, rd ? "..[*]" : "[*]"); + break; + case JP_INDEX_SUBSCRIPT: + appendStringInfo(string, "%s[%ld]", rd ? ".." : "", elem->data.index); + break; + case JP_KEY_SUBSCRIPT: + tmp = json_encode_string(elem->data.key.ptr, elem->data.key.length, '"'); + Assert(tmp != NULL); + appendStringInfo(string, "%s[%s]", rd ? ".." : "", tmp); + pfree(tmp); + break; + default: + Assert(false); + } + } + + return string->data; +} + +JSONPath *jp_parse(const char *pattern) +{ + JSONPath *jp = NIL; + const char *s = pattern; + const char *p; + bool recursive_descent = false; + bool bracket = false; + const char *err_msg = NULL; + long index; + char *key; + size_t key_length; + + skip_spaces(&s); + + /* pattern may not be empty */ + if (!*s) + return NULL; + + jp = lappend(jp, mkRoot()); + + if (*s == '$') { + s++; + goto begin_element; + } else if (*s != '.') { + goto dot_subscript; // implicit '.' at beginning + } + +begin_element: + skip_spaces(&s); + + recursive_descent = false; + bracket = false; + + if (*s == '\0') + goto end; + if (s[0] == '.' && s[1] == '.') { + recursive_descent = true; + s += 2; + goto dot_subscript; + } + if (s[0] == '.') { + s++; + goto dot_subscript; + } + if (s[0] == '[') { + s++; + goto bracket_subscript; + } + + goto failed; + +next_element: + if (bracket) { + skip_spaces(&s); + if (*s != ']') + goto failed; + s++; + } + goto begin_element; + +dot_subscript: + skip_spaces(&s); + + if (*s == '*') + goto wildcard; + if (integer_start(*s)) + goto integer; + if (identifier_start(*s)) + goto identifier; + if (*s == '"' || *s == '\'') + goto string; + if (*s == '[') { + s++; + goto bracket_subscript; + } + + goto failed; + +bracket_subscript: + skip_spaces(&s); + + bracket = true; + + if (*s == '*') + goto wildcard; + if (integer_start(*s)) + goto integer; + if (identifier_start(*s)) { + err_msg = "Identifiers may not be bracketed. This syntax is reserved for future use."; + goto failed; + } + if (*s == '"' || *s == '\'') + goto string; + + goto failed; + +wildcard: + s++; + jp = lappend(jp, mkWildcard(recursive_descent)); + goto next_element; + +integer: + p = s; + errno = 0; + index = strtol(s, (char**)&p, 10); + if (p <= s || errno != 0) + goto failed; + s = p; + + jp = lappend(jp, mkIndexSubscript(index, recursive_descent)); + goto next_element; + +identifier: + p = s; + while (identifier_char(*p)) + p++; + key = pnstrdup(s, p - s); + key_length = p - s; + s = p; + + jp = lappend(jp, mkKeySubscript(key, key_length, recursive_descent)); + goto next_element; + +string: + key = json_decode_string(&s, &key_length, false); + if (!key) + goto failed; + + jp = lappend(jp, mkKeySubscript(key, key_length, recursive_descent)); + goto next_element; + +end: + return jp; + +failed: + return NULL; +} + +List *jp_match(JSONPath *jp, json_node *json) +{ + +} diff --git a/jsonpath.h b/jsonpath.h new file mode 100644 index 0000000..4d19062 --- /dev/null +++ b/jsonpath.h @@ -0,0 +1,37 @@ +#ifndef JSONPATH_H +#define JSONPATH_H + +#include "json.h" + +#include "nodes/pg_list.h" + +typedef enum { + JP_ROOT, + JP_WILDCARD, + JP_INDEX_SUBSCRIPT, + JP_KEY_SUBSCRIPT +} jp_element_type; + +typedef struct { + jp_element_type type; + + union { + long index; + struct { + char *ptr; + size_t length; + } key; + } data; + + // If element was preceded by ".." in pattern + bool recursive_descent; +} jp_element; + +typedef List JSONPath; + +JSONPath *jp_parse(const char *pattern); +char *jp_show(JSONPath *jp); + +List *jp_match(JSONPath *jp, json_node *json); + +#endif diff --git a/sql/json.sql b/sql/json.sql index 6e8aa86..7cbe026 100644 --- a/sql/json.sql +++ b/sql/json.sql @@ -158,3 +158,56 @@ SELECT from_json('[1,2,3]'); -- from_json: invalid SELECT from_json('invalid'); +SELECT parse_json_path('..'); +SELECT parse_json_path('..*'); +SELECT parse_json_path('.*'); +SELECT parse_json_path(''); +SELECT parse_json_path('$.*'); +SELECT parse_json_path('$'); +SELECT parse_json_path('$*'); +SELECT parse_json_path('*'); +SELECT parse_json_path($$ .. [ 025 ] $$); +SELECT parse_json_path($$ $ . 0 . 3 $$); +SELECT parse_json_path($$ $ . 0_3 $$); +SELECT parse_json_path($$ $ . 0.3 $$); +SELECT parse_json_path($$ .. [ 031 ] $$); +SELECT parse_json_path($$ $ . 0 . 3 . hello $$); +SELECT parse_json_path($$ $ . 0 . 3 . hello . "back\\slash" $$); +SELECT parse_json_path($$ $ . 0 . 3 . hello . ["back\\slash"] $$); +SELECT parse_json_path($$ $ . 0 . 3 . hello . back\slash $$); +SELECT parse_json_path($$ $ . 0 . 3 . hello . backslash $$); +SELECT parse_json_path($$ .. [ 0x31 ] $$); +SELECT parse_json_path($$ [ 0x31 ] $$); +SELECT parse_json_path($$ _3 $$); +SELECT parse_json_path($$ _3_ $$); +SELECT parse_json_path($$ [ _3 ] $$); +SELECT parse_json_path($$ $ _3 $$); +SELECT parse_json_path($$ $ . _3 $$); +SELECT parse_json_path('..["5"]'); +SELECT parse_json_path('..[5]'); +SELECT parse_json_path('..5'); +SELECT parse_json_path($$ .. [ -5 ] $$); +SELECT parse_json_path($$ .. [ "5" ] $$); +SELECT parse_json_path($$ .. [ +5 ] $$); +SELECT parse_json_path($$ .. [ 5 ] $$); +SELECT parse_json_path($$ .. ["5"] $$); +SELECT parse_json_path($$ ..["5"] $$); +SELECT parse_json_path($$ [ "5" ] $$); +SELECT parse_json_path($$ [ 5 ] $$); +SELECT parse_json_path('."hello"'); +SELECT parse_json_path('.hello'); +SELECT parse_json_path('...["hello world"]'); +SELECT parse_json_path('..["hello world"]'); +SELECT parse_json_path('."hello world"'); +SELECT parse_json_path('.["hello world"]'); +SELECT parse_json_path('.hello world'); +SELECT parse_json_path('..["hello world"]["5"]'); +SELECT parse_json_path('..["hello world"][5]'); +SELECT parse_json_path('..["hello world"][5]..[3]..*.[*]'); +SELECT parse_json_path('..["hello world"][5]..[3]..*'); +SELECT parse_json_path('..["hello world"][5]..[3]'); +SELECT parse_json_path('..["hello world"][5]..[3]*'); +SELECT parse_json_path($$ + .. [ '"hello"\\"world"' ] + [ 5 ] .. [3] . * [ * ] +$$); |
