2020#include "common/unicode_category.h"
2121#include "mb/pg_wchar.h"
2222
/*
 * Outcome of mapping a single code point with casemap().  The value tells
 * the caller which of casemap()'s output arguments, if any, holds the result.
 */
enum CaseMapResult
{
	CASEMAP_SELF,				/* no mapping; copy the character unchanged */
	CASEMAP_SIMPLE,				/* replaced by a single code point */
	CASEMAP_SPECIAL,			/* replaced by up to MAX_CASE_EXPANSION code
								 * points */
};
29+
2330static const pg_case_map *find_case_map(pg_wchar ucs);
2431static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
2532 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
2633 void *wbstate);
27- static bool check_special_conditions(int conditions, const char *str,
28- size_t len, size_t offset);
34+ static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
35+ const char *src, size_t srclen, size_t srcoff,
36+ pg_wchar *u2, const pg_wchar **special);
2937
3038pg_wchar
3139unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214222 {
215223 pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
216224 int u1len = unicode_utf8len(u1);
217- const pg_case_map *casemap = find_case_map(u1);
218- const pg_special_case *special = NULL;
225+ pg_wchar simple = 0;
226+ const pg_wchar *special = NULL;
227+ enum CaseMapResult casemap_result;
219228
220229 if (str_casekind == CaseTitle)
221230 {
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
228237 chr_casekind = CaseLower;
229238 }
230239
231- /*
232- * Find special case that matches the conditions, if any.
233- *
234- * Note: only a single special mapping per codepoint is currently
235- * supported, though Unicode allows for multiple special mappings for
236- * a single codepoint.
237- */
238- if (full && casemap && casemap->special_case)
239- {
240- int16 conditions = casemap->special_case->conditions;
241-
242- Assert(casemap->special_case->codepoint == u1);
243- if (check_special_conditions(conditions, src, srclen, srcoff))
244- special = casemap->special_case;
245- }
240+ casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
241+ &simple, &special);
246242
247- /* perform mapping, update result_len, and write to dst */
248- if (special)
243+ switch (casemap_result)
249244 {
250- for (int i = 0; i < MAX_CASE_EXPANSION; i++)
251- {
252- pg_wchar u2 = special->map[chr_casekind][i];
253- size_t u2len = unicode_utf8len(u2);
254-
255- if (u2 == '\0')
256- break;
257-
258- if (result_len + u2len <= dstsize)
259- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
260-
261- result_len += u2len;
262- }
263- }
264- else if (casemap)
265- {
266- pg_wchar u2 = casemap->simplemap[chr_casekind];
267- pg_wchar u2len = unicode_utf8len(u2);
268-
269- if (result_len + u2len <= dstsize)
270- unicode_to_utf8(u2, (unsigned char *) dst + result_len);
271-
272- result_len += u2len;
273- }
274- else
275- {
276- /* no mapping; copy bytes from src */
277- if (result_len + u1len <= dstsize)
278- memcpy(dst + result_len, src + srcoff, u1len);
279-
280- result_len += u1len;
245+ case CASEMAP_SELF:
246+ /* no mapping; copy bytes from src */
247+ Assert(simple == 0);
248+ Assert(special == NULL);
249+ if (result_len + u1len <= dstsize)
250+ memcpy(dst + result_len, src + srcoff, u1len);
251+
252+ result_len += u1len;
253+ break;
254+ case CASEMAP_SIMPLE:
255+ {
256+ /* replace with single character */
257+ pg_wchar u2 = simple;
258+ pg_wchar u2len = unicode_utf8len(u2);
259+
260+ Assert(special == NULL);
261+ if (result_len + u2len <= dstsize)
262+ unicode_to_utf8(u2, (unsigned char *) dst + result_len);
263+
264+ result_len += u2len;
265+ }
266+ break;
267+ case CASEMAP_SPECIAL:
268+ /* replace with up to MAX_CASE_EXPANSION characters */
269+ Assert(simple == 0);
270+ for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
271+ {
272+ pg_wchar u2 = special[i];
273+ size_t u2len = unicode_utf8len(u2);
274+
275+ if (result_len + u2len <= dstsize)
276+ unicode_to_utf8(u2, (unsigned char *) dst + result_len);
277+
278+ result_len += u2len;
279+ }
280+ break;
281281 }
282282
283283 srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
351351 return true;
352352}
353353
354+ /*
355+ * Unicode allows for special casing to be applied only under certain
356+ * circumstances. The only currently-supported condition is Final_Sigma.
357+ */
354358static bool
355359check_special_conditions(int conditions, const char *str, size_t len,
356360 size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
365369 return false;
366370}
367371
372+ /*
373+ * Map the given character to the requested case.
374+ *
375+ * If full is true, and a special case mapping is found and the conditions are
376+ * met, 'special' is set to the mapping result (which is an array of up to
377+ * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
378+ *
379+ * Otherwise, search for a simple mapping, and if found, set 'simple' to the
380+ * result and return CASEMAP_SIMPLE.
381+ *
382+ * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
383+ * character without modification.
384+ */
385+ static enum CaseMapResult
386+ casemap(pg_wchar u1, CaseKind casekind, bool full,
387+ const char *src, size_t srclen, size_t srcoff,
388+ pg_wchar *simple, const pg_wchar **special)
389+ {
390+ const pg_case_map *map;
391+
392+ if (u1 < 0x80)
393+ {
394+ *simple = case_map[u1].simplemap[casekind];
395+
396+ return CASEMAP_SIMPLE;
397+ }
398+
399+ map = find_case_map(u1);
400+
401+ if (map == NULL)
402+ return CASEMAP_SELF;
403+
404+ if (full && map->special_case != NULL &&
405+ check_special_conditions(map->special_case->conditions,
406+ src, srclen, srcoff))
407+ {
408+ *special = map->special_case->map[casekind];
409+ return CASEMAP_SPECIAL;
410+ }
411+
412+ *simple = map->simplemap[casekind];
413+
414+ return CASEMAP_SIMPLE;
415+ }
416+
368417/* find entry in simple case map, if any */
369418static const pg_case_map *
370419find_case_map(pg_wchar ucs)
0 commit comments