Made JSON datatype well-behaved with respect to character sets.

Note that this is currently untested with server encodings other than UTF-8. The encoding policy used is: JSON nodes and most of the JSON functions still operate in UTF-8. Strings are converted between server encoding and UTF-8 when they go in and out of varlena (text*), and a set of helper functions is implemented to make these conversions simple to apply. It is done this way because converting individual codepoints to/from whatever the server encoding may be is nontrivial (possibly requires a loaded module). The JSON code needs to encode/decode codepoints when it deals with escapes. Although a more clever and efficient solution might be to defer charset conversions to when they're necessary (e.g. round up all the escapes and encode them all at once), this is not simple, and it's probably not much more efficient, either. Conversions to/from server encoding and UTF-8 are no-ops when the server encoding is UTF-8, anyway.
author: Joey Adams 2010-08-04 21:44:22 +0000
committer: Joey Adams 2010-08-04 21:44:22 +0000
commit: 2b2fda2b7219004d65c1d121e7fed52ba85a9fb8 (patch)
tree: 30300ad44b786a6d5ed6106f1010c481d49b6e10 /util.c
parent: f475e581b72b8c42cf951f6653610d15e71caeee (diff)
1 files changed, 96 insertions, 0 deletions
diff --git a/util.c b/util.c
index 9c64542..3171a4f 100644
--- a/util.c
+++ b/util.c
@@ -52,6 +52,31 @@ enumLabelToOid(const char *typname, const char *label)
 	return ret;
 }
 
+/*
+ * utf8_substring
+ *    Find substring bounds in a UTF-8-encoded string.
+ *
+ *    @src and @srcbytes are the start and byte length of the input string.
+ *    @start and @length are the start and number of characters requested.
+ *
+ *    Writes the bounds of the substring to
+ *    *out_start (start) and *out_bytes (byte length).
+ *    Returns the number of characters (not bytes) in the substring.
+ *
+ *    Example:
+ *       const char *out_start;
+ *       int         out_bytes;
+ *       int         out_chars;
+ *
+ *       out_chars =
+ *           utf8_substring("⁰¹²³", 9,
+ *                          1, 100,
+ *                          &out_start, &out_bytes);
+ *
+ *    out_chars will be 3.
+ *    out_start will point to the "¹".
+ *    out_bytes will be 6.
+ */
 size_t
 utf8_substring(
 			   const char *src, size_t srcbytes,
@@ -124,6 +149,11 @@ utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
 	*sp = (const char *) s;
 }
 
+/*
+ * utf8_validate
+ *    Essentially a variant of pg_verify_mbstr(PG_UTF8, str, length, true)
+ *    that allows '\0' characters.
+ */
 bool
 utf8_validate(const char *str, size_t length)
 {
@@ -158,3 +188,69 @@ utf8_encode_char(char *out, unsigned int uc)
 	unicode_to_utf8(uc, (unsigned char *) out);
 	return pg_utf_mblen((unsigned char *) out);
 }
+
+char *
+server_to_utf8(const char *str, int len)	/* convert @len bytes at @str from the database encoding to UTF-8 */
+{
+	return (char *) pg_do_encoding_conversion(	/* callers in this file compare the result with the input to detect "no conversion" */
+				(unsigned char *) str, len, GetDatabaseEncoding(), PG_UTF8);
+}
+
+char *
+utf8_to_server(const char *str, int len)	/* convert @len bytes at @str from UTF-8 to the database encoding */
+{
+	return (char *) pg_do_encoding_conversion(	/* callers in this file compare the result with the input to detect "no conversion" */
+				(unsigned char *) str, len, PG_UTF8, GetDatabaseEncoding());
+}
+
+/*
+ * Adaptations of text_to_cstring and cstring_to_text for simplifying UTF-8 conversions.
+ *
+ * Just like text_to_cstring, text_to_utf8_cstring will always return a palloc'd,
+ * null-terminated C-string.
+ */
+char *text_to_utf8_cstring(const text *t)	/* returns the contents of @t as a UTF-8 C string */
+{
+	/* must cast away the const, just like in text_to_cstring */
+	text		*tunpacked	= pg_detoast_datum_packed((struct varlena *) t);
+	const char	*data		= VARDATA_ANY(tunpacked);
+	int			len			= VARSIZE_ANY_EXHDR(tunpacked);
+	char		*result;
+	
+	result = server_to_utf8(data, len);
+	if (result == data)	/* nothing was converted: result still points into the datum's own bytes */
+		result = pnstrdup(data, len);	/* so copy them out before the datum may be freed below */
+	
+	if (tunpacked != t)	/* detoasting produced a separate copy; release it */
+		pfree(tunpacked);
+	
+	return result;
+}
+
+text *utf8_cstring_to_text(const char *s)	/* @s is a NUL-terminated UTF-8 string */
+{
+	return utf8_cstring_to_text_with_len(s, strlen(s));	/* length passed on is in bytes, terminator excluded */
+}
+
+text *utf8_cstring_to_text_with_len(const char *s, int len)	/* build a text datum (database encoding) from @len bytes of UTF-8 at @s */
+{
+	char	*cstring;
+	int		cstring_len;
+	text	*result;
+	
+	cstring	= utf8_to_server(s, len);
+	if (cstring == s)	/* no conversion took place; the input bytes are used as-is */
+		cstring_len = len;
+	else	/* converted copy: its byte length may differ from len */
+		cstring_len = strlen(cstring);
+	
+	result	= (text *) palloc(cstring_len + VARHDRSZ);	/* size by the converted length, not the UTF-8 length */
+	
+	SET_VARSIZE(result, cstring_len + VARHDRSZ);	/* header must match the number of bytes copied below */
+	memcpy(VARDATA(result), cstring, cstring_len);
+	
+	if (cstring != s)	/* free only the buffer the conversion allocated, never the caller's */
+		pfree(cstring);
+	
+	return result;
+}
author	Joey Adams	2010-08-04 21:44:22 +0000
committer	Joey Adams	2010-08-04 21:44:22 +0000
commit	2b2fda2b7219004d65c1d121e7fed52ba85a9fb8 (patch)
tree	30300ad44b786a6d5ed6106f1010c481d49b6e10 /util.c
parent	f475e581b72b8c42cf951f6653610d15e71caeee (diff)