diff options
Diffstat (limited to 'src/backend/utils/adt/pg_locale_icu.c')
| -rw-r--r-- | src/backend/utils/adt/pg_locale_icu.c | 83 |
1 files changed, 74 insertions, 9 deletions
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index e8820666b2d..43d44fe43bd 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -61,6 +61,8 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); static size_t strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale); +static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale); static int strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale); @@ -121,16 +123,9 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity, const char *locale, UErrorCode *pErrorCode); -static bool -char_is_cased_icu(char ch, pg_locale_t locale) -{ - return IS_HIGHBIT_SET(ch) || - (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); -} - /* * XXX: many of the functions below rely on casts directly from pg_wchar to - * UChar32, which is correct for the UTF-8 encoding, but not in general. + * UChar32, which is correct for UTF-8 and LATIN1, but not in general. */ static pg_wchar @@ -234,6 +229,7 @@ static const struct ctype_methods ctype_methods_icu = { .strtitle = strtitle_icu, .strupper = strupper_icu, .strfold = strfold_icu, + .downcase_ident = downcase_ident_icu, .wc_isdigit = wc_isdigit_icu, .wc_isalpha = wc_isalpha_icu, .wc_isalnum = wc_isalnum_icu, @@ -244,11 +240,33 @@ static const struct ctype_methods ctype_methods_icu = { .wc_ispunct = wc_ispunct_icu, .wc_isspace = wc_isspace_icu, .wc_isxdigit = wc_isxdigit_icu, - .char_is_cased = char_is_cased_icu, .wc_iscased = wc_iscased_icu, .wc_toupper = toupper_icu, .wc_tolower = tolower_icu, }; + +/* + * ICU still depends on libc for compatibility with certain historical + * behavior for single-byte encodings. See downcase_ident_icu(). + * + * XXX: consider fixing by decoding the single byte into a code point, and + * using u_tolower(). + */ +static locale_t +make_libc_ctype_locale(const char *ctype) +{ + locale_t loc; + +#ifndef WIN32 + loc = newlocale(LC_CTYPE_MASK, ctype, NULL); +#else + loc = _create_locale(LC_ALL, ctype); +#endif + if (!loc) + report_newlocale_failure(ctype); + + return loc; +} #endif pg_locale_t @@ -259,6 +277,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) const char *iculocstr; const char *icurules = NULL; UCollator *collator; + locale_t loc = (locale_t) 0; pg_locale_t result; if (collid == DEFAULT_COLLATION_OID) @@ -281,6 +300,18 @@ create_pg_locale_icu(Oid collid, MemoryContext context) if (!isnull) icurules = TextDatumGetCString(datum); + /* libc only needed for default locale and single-byte encoding */ + if (pg_database_encoding_max_length() == 1) + { + const char *ctype; + + datum = SysCacheGetAttrNotNull(DATABASEOID, tp, + Anum_pg_database_datctype); + ctype = TextDatumGetCString(datum); + + loc = make_libc_ctype_locale(ctype); + } + ReleaseSysCache(tp); } else @@ -311,6 +342,7 @@ create_pg_locale_icu(Oid collid, MemoryContext context) result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->icu.locale = MemoryContextStrdup(context, iculocstr); result->icu.ucol = collator; + result->icu.lt = loc; result->deterministic = deterministic; result->collate_is_c = false; result->ctype_is_c = false; @@ -573,6 +605,39 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, } /* + * For historical compatibility, behavior is not multibyte-aware. + * + * NB: uses libc tolower() for single-byte encodings (also for historical + * compatibility), and therefore relies on the global LC_CTYPE setting. + */ +static size_t +downcase_ident_icu(char *dst, size_t dstsize, const char *src, + ssize_t srclen, pg_locale_t locale) +{ + int i; + bool libc_lower; + locale_t lt = locale->icu.lt; + + libc_lower = lt && (pg_database_encoding_max_length() == 1); + + for (i = 0; i < srclen && i < dstsize; i++) + { + unsigned char ch = (unsigned char) src[i]; + + if (ch >= 'A' && ch <= 'Z') + ch = pg_ascii_tolower(ch); + else if (libc_lower && IS_HIGHBIT_SET(ch) && isupper_l(ch, lt)) + ch = tolower_l(ch, lt); + dst[i] = (char) ch; + } + + if (i < dstsize) + dst[i] = '\0'; + + return srclen; +} + +/* * strncoll_icu_utf8 * * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given |
