1 files changed, 96 insertions, 0 deletions
diff --git a/util.c b/util.c
index 9c64542..3171a4f 100644
--- a/util.c
+++ b/util.c
@@ -52,6 +52,31 @@ enumLabelToOid(const char *typname, const char *label)
 	return ret;
 }
 
+/*
+ * utf8_substring
+ *    Find substring bounds in a UTF-8-encoded string.
+ *
+ *    @src and @srcbytes are the start and byte length of the input string.
+ *    @start and @length are the start and number of characters requested.
+ *
+ *    Writes the bounds of the substring to
+ *    *out_start (start) and *out_bytes (byte length).
+ *    Returns the number of characters (not bytes) in the substring.
+ *
+ *    Example:
+ *       const char *out_start;
+ *       int         out_bytes;
+ *       int         out_chars;
+ *
+ *       out_chars =
+ *           utf8_substring("⁰¹²³", 9,
+ *                          1, 100,
+ *                          &out_start, &out_bytes);
+ *
+ *    out_chars will be 3.
+ *    out_start will point to the "¹".
+ *    out_bytes will be 6.
+ */
 size_t
 utf8_substring(
 			   const char *src, size_t srcbytes,
@@ -124,6 +149,11 @@ utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
 	*sp = (const char *) s;
 }
 
+/*
+ * utf8_validate
+ *    Essentially a variant of pg_verify_mbstr(PG_UTF8, str, length, true)
+ *    that allows '\0' characters.
+ */
 bool
 utf8_validate(const char *str, size_t length)
 {
@@ -158,3 +188,69 @@ utf8_encode_char(char *out, unsigned int uc)
 	unicode_to_utf8(uc, (unsigned char *) out);
 	return pg_utf_mblen((unsigned char *) out);
 }
+
+char *
+server_to_utf8(const char *str, int len)
+{
+	unsigned char *raw = (unsigned char *) str;	/* input bytes, database encoding */
+	return (char *) pg_do_encoding_conversion(raw, len, GetDatabaseEncoding(), PG_UTF8);
+}
+
+char *
+utf8_to_server(const char *str, int len)
+{
+	unsigned char *raw = (unsigned char *) str;	/* input bytes, UTF-8 */
+	return (char *) pg_do_encoding_conversion(raw, len, PG_UTF8, GetDatabaseEncoding());
+}
+
+/*
+ * Adaptations of text_to_cstring and cstring_to_text for simplifying UTF-8 conversions.
+ *
+ * Just like text_to_cstring, text_to_utf8_cstring will always return a palloc'd,
+ * null-terminated C-string.
+ */
+char *text_to_utf8_cstring(const text *t)
+{
+	/* Detoasting takes a non-const pointer, hence the cast (as in text_to_cstring). */
+	text		*plain = pg_detoast_datum_packed((struct varlena *) t);
+	const char	*payload = VARDATA_ANY(plain);
+	int			nbytes = VARSIZE_ANY_EXHDR(plain);
+	char		*utf8 = server_to_utf8(payload, nbytes);
+
+	/* Conversion handed back the input itself: make our own terminated copy. */
+	if (utf8 == payload)
+		utf8 = pnstrdup(payload, nbytes);
+
+	/* Release the detoasted copy if detoasting produced a new one. */
+	if (plain != t)
+		pfree(plain);
+	return utf8;
+}
+
+text *utf8_cstring_to_text(const char *s)
+{
+	return utf8_cstring_to_text_with_len(s, strlen(s));	/* s must be NUL-terminated: strlen() measures it */
+}
+
+/* Build a palloc'd text from @len bytes of UTF-8 at @s, converted to the server encoding. */
+text *utf8_cstring_to_text_with_len(const char *s, int len)
+{
+	char	*cstring;
+	int		cstring_len;
+	text	*result;
+
+	/* utf8_to_server hands back s itself when it did not produce a converted copy. */
+	cstring	= utf8_to_server(s, len);
+	if (cstring == s)
+		cstring_len = len;
+	else
+		cstring_len = strlen(cstring);
+
+	/* Size the datum by the converted length, which can differ from len. */
+	result	= (text *) palloc(cstring_len + VARHDRSZ);
+	SET_VARSIZE(result, cstring_len + VARHDRSZ);
+	memcpy(VARDATA(result), cstring, cstring_len);
+	if (cstring != s)
+		pfree(cstring);
+	return result;
+}