diff options
Diffstat (limited to 'util.c')
| -rw-r--r-- | util.c | 96 |
1 files changed, 96 insertions, 0 deletions
@@ -52,6 +52,31 @@ enumLabelToOid(const char *typname, const char *label) return ret; } +/* + * utf8_substring + * Find substring bounds in a UTF-8-encoded string. + * + * @src and @srcbytes are the start and byte length of the input string. + * @start and @length are the start and number of characters requested. + * + * Writes the bounds of the substring to + * *out_start (start) and *out_bytes (byte length). + * Returns the number of characters (not bytes) in the substring. + * + * Example: + * const char *out_start; + * int out_bytes; + * int out_chars; + * + * out_chars = + * utf8_substring("⁰¹²³", 9, + * 1, 100, + * &out_start, &out_bytes); + * + * out_chars will be 3. + * out_start will point to the "¹". + * out_bytes will be 6. + */ size_t utf8_substring( const char *src, size_t srcbytes, @@ -124,6 +149,11 @@ utf8_decode_char_nocheck(const char **sp, unsigned int *uc) *sp = (const char *) s; } +/* + * utf8_validate + * Essentially a variant of pg_verify_mbstr(PG_UTF8, str, length, true) + * that allows '\0' characters. + */ bool utf8_validate(const char *str, size_t length) { @@ -158,3 +188,69 @@ utf8_encode_char(char *out, unsigned int uc) unicode_to_utf8(uc, (unsigned char *) out); return pg_utf_mblen((unsigned char *) out); } + +char * +server_to_utf8(const char *str, int len) +{ /* database encoding -> UTF-8 */ + return (char *) pg_do_encoding_conversion( + (unsigned char *) str, len, GetDatabaseEncoding(), PG_UTF8); +} + +char * +utf8_to_server(const char *str, int len) +{ /* UTF-8 -> database encoding */ + return (char *) pg_do_encoding_conversion( + (unsigned char *) str, len, PG_UTF8, GetDatabaseEncoding()); +} + +/* + * Adaptations of text_to_cstring and cstring_to_text for simplifying UTF-8 conversions. + * + * Just like text_to_cstring, text_to_utf8_cstring will always return a palloc'd, + * null-terminated C-string. 
+ */ +char *text_to_utf8_cstring(const text *t) +{ + /* must cast away the const, just like in text_to_cstring */ + text *tunpacked = pg_detoast_datum_packed((struct varlena *) t); + const char *data = VARDATA_ANY(tunpacked); + int len = VARSIZE_ANY_EXHDR(tunpacked); + char *result; + + result = server_to_utf8(data, len); + if (result == data) + result = pnstrdup(data, len); + + if (tunpacked != t) + pfree(tunpacked); + + return result; +} + +text *utf8_cstring_to_text(const char *s) +{ + return utf8_cstring_to_text_with_len(s, strlen(s)); +} + +text *utf8_cstring_to_text_with_len(const char *s, int len) +{ + char *cstring; + int cstring_len; + text *result; + + cstring = utf8_to_server(s, len); + if (cstring == s) + cstring_len = len; + else + cstring_len = strlen(cstring); + + result = (text *) palloc(len + VARHDRSZ); + + SET_VARSIZE(result, len + VARHDRSZ); + memcpy(VARDATA(result), cstring, cstring_len); + + if (cstring != s) + pfree(cstring); + + return result; +} |
