summary | refs | log | tree | commit | diff
path: root/util.c
diff options
context:
space:
mode:
Diffstat (limited to 'util.c')
-rw-r--r-- util.c 64
1 files changed, 64 insertions, 0 deletions
diff --git a/util.c b/util.c
index 39fb0b8..9c64542 100644
--- a/util.c
+++ b/util.c
@@ -94,3 +94,67 @@ utf8_substring(
*out_bytes = sub_end - sub_start;
return sub_length;
}
+
+/*
+ * Compile-time flag, fixed to false.  Nothing in this hunk reads it.
+ * NOTE(review): presumably intended to control whether surrogate code
+ * points are accepted by the UTF-8 helpers -- confirm against the rest of
+ * util.c.
+ */
+static const bool utf8_allow_surrogates = false;
+
+/*
+ * Decode a single UTF-8 sequence starting at *sp, without validation.
+ *
+ * The number of continuation bytes is derived solely from the lead byte:
+ * below 0x80 -> 0, below 0xE0 -> 1, below 0xF0 -> 2, otherwise 3.  The
+ * payload bits of the lead byte and the low six bits of each continuation
+ * byte are combined into one code point.
+ *
+ * sp: in/out; on return *sp points just past the bytes consumed.
+ * uc: out; receives the decoded code point.
+ *
+ * No bounds or well-formedness checks are made, so the input must already
+ * be known to hold a complete, valid sequence.
+ */
+void
+utf8_decode_char_nocheck(const char **sp, unsigned int *uc)
+{
+    /* Payload mask for the lead byte, indexed by continuation-byte count. */
+    static const unsigned char lead_mask[4] = {0xFF, 0x1F, 0x0F, 0x07};
+
+    const unsigned char *cursor = (const unsigned char *) *sp;
+    unsigned char lead = *cursor++;
+    unsigned int trailing;
+    unsigned int codepoint;
+    unsigned int i;
+
+    trailing = (lead < 0x80) ? 0
+             : (lead < 0xE0) ? 1
+             : (lead < 0xF0) ? 2
+             : 3;
+
+    codepoint = lead & lead_mask[trailing];
+
+    /* Each continuation byte contributes its low six bits. */
+    for (i = 0; i < trailing; i++)
+        codepoint = (codepoint << 6) | (*cursor++ & 0x3F);
+
+    *uc = codepoint;
+    *sp = (const char *) cursor;
+}
+
+/*
+ * Check whether the length-byte buffer at str is well-formed UTF-8.
+ *
+ * Bytes <= 0x7F are skipped one at a time.  For any other byte the sequence
+ * length is obtained from pg_utf_mblen() and the sequence is checked with
+ * pg_utf8_islegal(); both helpers are defined outside this hunk
+ * (NOTE(review): presumably PostgreSQL's mb/pg_wchar.h -- confirm).
+ *
+ * Returns true when every sequence fits inside the buffer and is accepted
+ * by pg_utf8_islegal(), false otherwise.  A zero-length buffer yields true.
+ */
+bool
+utf8_validate(const char *str, size_t length)
+{
+    const unsigned char *s = (const unsigned char *) str;
+    const unsigned char *e = s + length;
+    int len;
+
+    while (s < e)
+    {
+        if (*s <= 0x7F)
+        {
+            s++;
+            continue;
+        }
+
+        len = pg_utf_mblen(s);
+
+        /*
+         * Compare against the bytes remaining instead of forming s + len:
+         * for a truncated final sequence that pointer would lie more than
+         * one past the end of the buffer, which is undefined behavior.
+         */
+        if (len > e - s)
+            return false;
+
+        if (!pg_utf8_islegal(s, len))
+            return false;
+
+        s += len;
+    }
+
+    return true;
+}
+
+/*
+ * Write the UTF-8 encoding of code point uc into out and return the value
+ * pg_utf_mblen() reports for the bytes just written.
+ *
+ * unicode_to_utf8() and pg_utf_mblen() are defined outside this hunk; the
+ * space out must provide is whatever unicode_to_utf8() requires.
+ */
+int
+utf8_encode_char(char *out, unsigned int uc)
+{
+    unsigned char *dst = (unsigned char *) out;
+
+    unicode_to_utf8(uc, dst);
+
+    return pg_utf_mblen(dst);
+}