diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/backend/parser/scansup.c | 36 |
| -rw-r--r-- | src/backend/utils/adt/pg_locale.c | 20 |
| -rw-r--r-- | src/backend/utils/adt/pg_locale_builtin.c | 2 |
| -rw-r--r-- | src/backend/utils/adt/pg_locale_icu.c | 36 |
| -rw-r--r-- | src/backend/utils/adt/pg_locale_libc.c | 33 |
| -rw-r--r-- | src/include/utils/pg_locale.h | 5 |
6 files changed, 107 insertions, 25 deletions
diff --git a/src/backend/parser/scansup.c b/src/backend/parser/scansup.c
index 2feb2b6cf5a..d63cb865260 100644
--- a/src/backend/parser/scansup.c
+++ b/src/backend/parser/scansup.c
@@ -18,6 +18,7 @@
 
 #include "mb/pg_wchar.h"
 #include "parser/scansup.h"
+#include "utils/pg_locale.h"
 
 
 /*
@@ -46,35 +47,22 @@ char *
 downcase_identifier(const char *ident, int len, bool warn, bool truncate)
 {
 	char	   *result;
-	int			i;
-	bool		enc_is_single_byte;
-
-	result = palloc(len + 1);
-	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	size_t		needed pg_attribute_unused();
 
 	/*
-	 * SQL99 specifies Unicode-aware case normalization, which we don't yet
-	 * have the infrastructure for.  Instead we use tolower() to provide a
-	 * locale-aware translation.  However, there are some locales where this
-	 * is not right either (eg, Turkish may do strange things with 'i' and
-	 * 'I').  Our current compromise is to use tolower() for characters with
-	 * the high bit set, as long as they aren't part of a multi-byte
-	 * character, and use an ASCII-only downcasing for 7-bit characters.
+	 * pg_downcase_ident() preserves string length, so len + 1 bytes suffice.
+	 *
+	 * NB: if we decide to support Unicode-aware identifier case folding, then
+	 * we need to account for a change in string length.
 	 */
-	for (i = 0; i < len; i++)
-	{
-		unsigned char ch = (unsigned char) ident[i];
+	result = palloc(len + 1);
 
-		if (ch >= 'A' && ch <= 'Z')
-			ch += 'a' - 'A';
-		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
-			ch = tolower(ch);
-		result[i] = (char) ch;
-	}
-	result[i] = '\0';
+	needed = pg_downcase_ident(result, len + 1, ident, len);
+	Assert(needed == len);
+	Assert(result[len] == '\0');
 
-	if (i >= NAMEDATALEN && truncate)
-		truncate_identifier(result, i, warn);
+	if (len >= NAMEDATALEN && truncate)
+		truncate_identifier(result, len, warn);
 
 	return result;
 }
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 8a3796aa5d0..ee08ac045b7 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1353,6 +1353,26 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 }
 
 /*
+ * Lowercase an identifier using the database default locale, falling back
+ * to strlower_c() if the locale provides no downcase_ident method.
+ *
+ * For historical reasons, does not use ordinary locale behavior; identifiers
+ * only. XXX: can we make this equivalent to pg_strfold(..., default_locale)?
+ */
+size_t
+pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen)
+{
+	pg_locale_t locale = default_locale;
+
+	if (locale == NULL || locale->ctype == NULL ||
+		locale->ctype->downcase_ident == NULL)
+		return strlower_c(dst, dstsize, src, srclen);
+	else
+		return locale->ctype->downcase_ident(dst, dstsize, src, srclen,
+											 locale);
+}
+
+/*
  * pg_strcoll
  *
  * Like pg_strncoll for NUL-terminated input strings.
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..145b4641b1b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -208,6 +208,8 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.strtitle = strtitle_builtin,
 	.strupper = strupper_builtin,
 	.strfold = strfold_builtin,
+	/* NULL: pg_downcase_ident() then uses strlower_c(), for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_builtin,
 	.wc_isalpha = wc_isalpha_builtin,
 	.wc_isalnum = wc_isalnum_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 18d026deda8..69f22b47a68 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -61,6 +61,8 @@ static size_t strupper_icu(char *dest, size_t destsize,
 						   const char *src, ssize_t srclen, pg_locale_t locale);
 static size_t strfold_icu(char *dest, size_t destsize,
 						  const char *src, ssize_t srclen, pg_locale_t locale);
+static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
+								 ssize_t srclen, pg_locale_t locale);
 static int	strncoll_icu(const char *arg1, ssize_t len1,
 						 const char *arg2, ssize_t len2,
 						 pg_locale_t locale);
@@ -123,7 +125,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
 
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
- * UChar32, which is correct for the UTF-8 encoding, but not in general.
+ * UChar32, which is correct for UTF-8 and LATIN1, but not in general.
  */
 
 static pg_wchar
@@ -227,6 +229,7 @@ static const struct ctype_methods ctype_methods_icu = {
 	.strtitle = strtitle_icu,
 	.strupper = strupper_icu,
 	.strfold = strfold_icu,
+	.downcase_ident = downcase_ident_icu,
 	.wc_isdigit = wc_isdigit_icu,
 	.wc_isalpha = wc_isalpha_icu,
 	.wc_isalnum = wc_isalnum_icu,
@@ -565,6 +568,37 @@ strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 }
 
 /*
+ * For historical compatibility, behavior is not multibyte-aware. Returns
+ * srclen and NUL-terminates dst only if room remains.
+ * NB: uses libc tolower() for single-byte encodings (also for historical
+ * compatibility), and therefore relies on the global LC_CTYPE setting.
+ */
+static size_t
+downcase_ident_icu(char *dst, size_t dstsize, const char *src,
+				   ssize_t srclen, pg_locale_t locale)
+{
+	int			i;
+	bool		enc_is_single_byte;
+
+	enc_is_single_byte = pg_database_encoding_max_length() == 1;
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch = pg_ascii_tolower(ch);
+		else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
+			ch = tolower(ch);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
+/*
  * strncoll_icu_utf8
  *
  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 3baa5816b5f..ab6117aaace 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -318,12 +318,41 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
 		return wc;
 }
 
+/*
+ * Characters A..Z always downcase to a..z, even in the Turkish locale.
+ * Characters beyond 127 use tolower_l() with the locale's locale_t.
+ */
+static size_t
+downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src,
+					   ssize_t srclen, pg_locale_t locale)
+{
+	locale_t	loc = locale->lt;
+	int			i;
+
+	for (i = 0; i < srclen && i < dstsize; i++)
+	{
+		unsigned char ch = (unsigned char) src[i];
+
+		if (ch >= 'A' && ch <= 'Z')
+			ch = pg_ascii_tolower(ch);
+		else if (IS_HIGHBIT_SET(ch) && isupper_l(ch, loc))
+			ch = tolower_l(ch, loc);
+		dst[i] = (char) ch;
+	}
+
+	if (i < dstsize)
+		dst[i] = '\0';
+
+	return srclen;
+}
+
 static const struct ctype_methods ctype_methods_libc_sb = {
 	.strlower = strlower_libc_sb,
 	.strtitle = strtitle_libc_sb,
 	.strupper = strupper_libc_sb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_sb,
+	.downcase_ident = downcase_ident_libc_sb,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -349,6 +378,8 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	/* NULL: pg_downcase_ident() then uses strlower_c(), for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_libc_sb,
 	.wc_isalpha = wc_isalpha_libc_sb,
 	.wc_isalnum = wc_isalnum_libc_sb,
@@ -370,6 +401,8 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.strupper = strupper_libc_mb,
 	/* in libc, casefolding is the same as lowercasing */
 	.strfold = strlower_libc_mb,
+	/* NULL: pg_downcase_ident() then uses strlower_c(), for historical reasons */
+	.downcase_ident = NULL,
 	.wc_isdigit = wc_isdigit_libc_mb,
 	.wc_isalpha = wc_isalpha_libc_mb,
 	.wc_isalnum = wc_isalnum_libc_mb,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 6cf1985963d..1e584819c5e 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -110,6 +110,9 @@ struct ctype_methods
 	size_t		(*strfold) (char *dest, size_t destsize,
 							const char *src, ssize_t srclen,
 							pg_locale_t locale);
+	size_t		(*downcase_ident) (char *dest, size_t destsize,
+								   const char *src, ssize_t srclen,
+								   pg_locale_t locale);
 
 	/* required */
 	bool		(*wc_isdigit) (pg_wchar wc, pg_locale_t locale);
@@ -187,6 +190,8 @@ extern size_t pg_strupper(char *dst, size_t dstsize,
 extern size_t pg_strfold(char *dst, size_t dstsize,
 						 const char *src, ssize_t srclen,
 						 pg_locale_t locale);
+extern size_t pg_downcase_ident(char *dst, size_t dstsize,
+								const char *src, ssize_t srclen);
 extern int	pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int	pg_strncoll(const char *arg1, ssize_t len1,
 						const char *arg2, ssize_t len2, pg_locale_t locale);
|
