summaryrefslogtreecommitdiff
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r--util.c96
1 files changed, 96 insertions, 0 deletions
diff --git a/util.c b/util.c
index 9c64542..3171a4f 100644
--- a/util.c
+++ b/util.c
@@ -52,6 +52,31 @@ enumLabelToOid(const char *typname, const char *label)
return ret;
}
+/*
+ * utf8_substring
+ * Find substring bounds in a UTF-8-encoded string.
+ *
+ * @src and @srcbytes are the start and byte length of the input string.
+ * @start and @length are the start and number of characters requested.
+ *
+ * Writes the bounds of the substring to
+ * *out_start (start) and *out_bytes (byte length).
+ * Returns the number of characters (not bytes) in the substring.
+ *
+ * Example:
+ * const char *out_start;
+ * int out_bytes;
+ * int out_chars;
+ *
+ * out_chars =
+ * utf8_substring("⁰¹²³", 9,
+ * 1, 100,
+ * &out_start, &out_bytes);
+ *
+ * out_chars will be 3.
+ * out_start will point to the "¹".
+ * out_bytes will be 6.
+ */
size_t
utf8_substring(
const char *src, size_t srcbytes,
@@ -124,6 +149,11 @@ utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
*sp = (const char *) s;
}
+/*
+ * utf8_validate
+ * Essentially a variant of pg_verify_mbstr(PG_UTF8, str, length, true)
+ * that allows '\0' characters.
+ */
bool
utf8_validate(const char *str, size_t length)
{
@@ -158,3 +188,69 @@ utf8_encode_char(char *out, unsigned int uc)
unicode_to_utf8(uc, (unsigned char *) out);
return pg_utf_mblen((unsigned char *) out);
}
+
+/*
+ * server_to_utf8
+ * Pass @len bytes starting at @str through pg_do_encoding_conversion,
+ * converting from the database encoding to UTF-8.
+ *
+ * Returns the conversion routine's result as a char pointer. Callers in
+ * this file compare the result against @str to tell whether they were
+ * handed back the input pointer itself rather than a separate buffer.
+ */
+char *
+server_to_utf8(const char *str, int len)
+{
+	int			server_encoding = GetDatabaseEncoding();
+	unsigned char *converted;
+
+	converted = pg_do_encoding_conversion((unsigned char *) str, len,
+										  server_encoding, PG_UTF8);
+
+	return (char *) converted;
+}
+
+/*
+ * utf8_to_server
+ * Pass @len bytes starting at @str through pg_do_encoding_conversion,
+ * converting from UTF-8 to the database encoding.
+ *
+ * Returns the conversion routine's result as a char pointer. Callers in
+ * this file compare the result against @str to tell whether they were
+ * handed back the input pointer itself rather than a separate buffer.
+ */
+char *
+utf8_to_server(const char *str, int len)
+{
+	int			server_encoding = GetDatabaseEncoding();
+	unsigned char *converted;
+
+	converted = pg_do_encoding_conversion((unsigned char *) str, len,
+										  PG_UTF8, server_encoding);
+
+	return (char *) converted;
+}
+
+/*
+ * Adaptations of text_to_cstring and cstring_to_text that fold in the
+ * conversion between the server encoding and UTF-8.
+ *
+ * text_to_utf8_cstring
+ * Return the contents of @t as UTF-8. Like text_to_cstring, the result
+ * is always a palloc'd, null-terminated C string owned by the caller.
+ */
+char *
+text_to_utf8_cstring(const text *t)
+{
+	/* the const has to be cast away here, same as text_to_cstring does */
+	text	   *detoasted = pg_detoast_datum_packed((struct varlena *) t);
+	const char *payload = VARDATA_ANY(detoasted);
+	int			nbytes = VARSIZE_ANY_EXHDR(detoasted);
+	char	   *utf8 = server_to_utf8(payload, nbytes);
+
+	/*
+	 * If we were handed back the datum's own payload pointer, take a
+	 * private copy before the datum is (possibly) freed below.
+	 */
+	if (utf8 == payload)
+		utf8 = pnstrdup(payload, nbytes);
+
+	/* release the detoasted copy only if detoasting actually made one */
+	if (detoasted != t)
+		pfree(detoasted);
+
+	return utf8;
+}
+
+/*
+ * utf8_cstring_to_text
+ * Build a text datum from the null-terminated UTF-8 string @s by
+ * measuring it with strlen and delegating to
+ * utf8_cstring_to_text_with_len.
+ */
+text *
+utf8_cstring_to_text(const char *s)
+{
+	int			nbytes = strlen(s);
+
+	return utf8_cstring_to_text_with_len(s, nbytes);
+}
+
+/*
+ * utf8_cstring_to_text_with_len
+ * Build a palloc'd text datum holding the first @len bytes of the
+ * UTF-8 string @s, converted to the server encoding.
+ *
+ * @s need not be null-terminated when no conversion takes place; the
+ * converted buffer, when one is produced, is measured with strlen.
+ */
+text *
+utf8_cstring_to_text_with_len(const char *s, int len)
+{
+	char	   *converted;
+	int			converted_len;
+	text	   *result;
+
+	converted = utf8_to_server(s, len);
+	if (converted == s)
+		converted_len = len;	/* input handed back unchanged */
+	else
+		converted_len = strlen(converted);
+
+	/*
+	 * Size the datum from the converted length, not the input length:
+	 * the two differ whenever conversion changes the byte count, and
+	 * using @len here would overflow the buffer or leave uninitialized
+	 * trailing bytes in the result.
+	 */
+	result = (text *) palloc(converted_len + VARHDRSZ);
+
+	SET_VARSIZE(result, converted_len + VARHDRSZ);
+	memcpy(VARDATA(result), converted, converted_len);
+
+	/* free the conversion buffer only if one was actually allocated */
+	if (converted != s)
+		pfree(converted);
+
+	return result;
+}