7 files changed, 353 insertions, 142 deletions
diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c
index 6e7b914c563..f8524548e46 100644
--- a/src/backend/utils/adt/bytea.c
+++ b/src/backend/utils/adt/bytea.c
@@ -15,18 +15,19 @@
 #include "postgres.h"
 
 #include "access/detoast.h"
-#include "catalog/pg_collation_d.h"
-#include "catalog/pg_type_d.h"
+#include "common/hashfn.h"
 #include "common/int.h"
 #include "fmgr.h"
+#include "lib/hyperloglog.h"
 #include "libpq/pqformat.h"
 #include "port/pg_bitutils.h"
+#include "port/pg_bswap.h"
 #include "utils/builtins.h"
 #include "utils/bytea.h"
 #include "utils/fmgrprotos.h"
+#include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/sortsupport.h"
-#include "utils/varlena.h"
 #include "varatt.h"
 
 /* GUC variable */
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
 							  bool length_not_specified);
 static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
 
+typedef struct
+{
+	bool		abbreviate;		/* Should we abbreviate keys? */
+	hyperLogLogState abbr_card; /* HyperLogLog state: distinct abbreviated keys */
+	hyperLogLogState full_card; /* HyperLogLog state: distinct full keys */
+	double		prop_card;		/* Required abbrev/full cardinality ratio */
+} ByteaSortSupport;
+
+/* Static function declarations for sort support */
+static int	byteafastcmp(Datum x, Datum y, SortSupport ssup);
+static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);
+static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);
+
 /*
  * bytea_catenate
  *	Guts of byteacat(), broken out so it can be used by other functions
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
 	PG_RETURN_BYTEA_P(result);
 }
 
+/*
+ * Full sortsupport comparator for bytea: memcmp(), then length as tie-break.
+ */
+static int
+byteafastcmp(Datum x, Datum y, SortSupport ssup)
+{
+	bytea	   *arg1 = DatumGetByteaPP(x);
+	bytea	   *arg2 = DatumGetByteaPP(y);
+	char	   *a1p,
+			   *a2p;
+	int			len1,
+				len2,
+				result;
+
+	a1p = VARDATA_ANY(arg1);
+	a2p = VARDATA_ANY(arg2);
+
+	len1 = VARSIZE_ANY_EXHDR(arg1);
+	len2 = VARSIZE_ANY_EXHDR(arg2);
+
+	result = memcmp(a1p, a2p, Min(len1, len2));
+	if ((result == 0) && (len1 != len2))
+		result = (len1 < len2) ? -1 : 1;
+
+	/* Free any copies made by DatumGetByteaPP(); we can't afford to leak. */
+	if (PointerGetDatum(arg1) != x)
+		pfree(arg1);
+	if (PointerGetDatum(arg2) != y)
+		pfree(arg2);
+
+	return result;
+}
+
+/*
+ * Conversion routine for sortsupport.  Converts original to abbreviated key
+ * representation.  We pack the first sizeof(Datum) bytes of the bytea data
+ * into a Datum (on little-endian machines, the bytes are stored in reverse
+ * order), zero-padding short values, and treat it as an unsigned integer.
+ */
+static Datum
+bytea_abbrev_convert(Datum original, SortSupport ssup)
+{
+	const size_t max_prefix_bytes = sizeof(Datum);
+	ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
+	bytea	   *authoritative = DatumGetByteaPP(original);
+	char	   *authoritative_data = VARDATA_ANY(authoritative);
+	Datum		res;
+	char	   *pres;
+	int			len;
+	uint32		hash;
+
+	pres = (char *) &res;
+
+	/* memset(), so any non-overwritten bytes are NUL */
+	memset(pres, 0, max_prefix_bytes);
+	len = VARSIZE_ANY_EXHDR(authoritative);
+
+	/*
+	 * Short byteas will have terminating NUL bytes in the abbreviated datum.
+	 * Abbreviated comparison need not make a distinction between these NUL
+	 * bytes, and NUL bytes representing actual NULs in the authoritative
+	 * representation.
+	 *
+	 * Hopefully a comparison at or past one abbreviated key's terminating NUL
+	 * byte will resolve the comparison without consulting the authoritative
+	 * representation; specifically, some later non-NUL byte in the longer
+	 * bytea can resolve the comparison against a subsequent terminating NUL
+	 * in the shorter bytea.  There will usually be what is effectively a
+	 * "length-wise" resolution there and then.
+	 *
+	 * If that doesn't work out -- if all bytes in the longer bytea positioned
+	 * at or past the offset of the smaller bytea (first) terminating NUL are
+	 * actually representative of NUL bytes in the authoritative binary bytea
+	 * (perhaps with some *terminating* NUL bytes towards the end of the
+	 * longer bytea iff it happens to still be small) -- then an authoritative
+	 * tie-breaker will happen, and do the right thing: explicitly consider
+	 * bytea length.
+	 */
+	memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
+
+	/*
+	 * Maintain approximate cardinality of both abbreviated keys and original,
+	 * authoritative keys using HyperLogLog.  Used as cheap insurance against
+	 * the worst case, where we do many bytea abbreviations for no saving in
+	 * full memcmp()-based comparisons.  These statistics are used by
+	 * bytea_abbrev_abort().
+	 *
+	 * First, hash the key proper, or a significant fraction of it.  Mix in
+	 * the length in order to compensate for cases where differences are past
+	 * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
+	 */
+	hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
+								   Min(len, PG_CACHE_LINE_SIZE)));
+
+	if (len > PG_CACHE_LINE_SIZE)
+		hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+	addHyperLogLog(&bss->full_card, hash);
+
+	/* Hash abbreviated key, folding its high and low 32 bits together */
+	{
+		uint32		tmp;
+
+		tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
+		hash = DatumGetUInt32(hash_uint32(tmp));
+	}
+
+	addHyperLogLog(&bss->abbr_card, hash);
+
+	/*
+	 * Byteswap on little-endian machines.
+	 *
+	 * This is needed so that ssup_datum_unsigned_cmp() works correctly on all
+	 * platforms.
+	 */
+	res = DatumBigEndianToNative(res);
+
+	/* Don't leak memory here */
+	if (PointerGetDatum(authoritative) != original)
+		pfree(authoritative);
+
+	return res;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules.  Returns true if the abbreviation optimization should be
+ * aborted, based on its projected effectiveness, and false to continue.
+ *
+ * This is based on varstr_abbrev_abort(), but some comments have been elided
+ * for brevity.  See there for more details.
+ */
+static bool
+bytea_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+	ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
+	double		abbrev_distinct,
+				key_distinct;
+
+	Assert(ssup->abbreviate);
+
+	/* Have a little patience: too few tuples to judge yet */
+	if (memtupcount < 100)
+		return false;
+
+	abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);
+	key_distinct = estimateHyperLogLog(&bss->full_card);
+
+	/*
+	 * Clamp cardinality estimates to at least one distinct value.  While
+	 * NULLs are generally disregarded, if only NULL values were seen so far,
+	 * that might misrepresent costs if we failed to clamp.
+	 */
+	if (abbrev_distinct < 1.0)
+		abbrev_distinct = 1.0;
+
+	if (key_distinct < 1.0)
+		key_distinct = 1.0;
+
+	if (trace_sort)
+	{
+		double		norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+		elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "
+			 "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
+			 memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
+			 bss->prop_card);
+	}
+
+	/*
+	 * If the number of distinct abbreviated keys is more than prop_card
+	 * times the number of distinct original keys, continue abbreviating.
+	 */
+	if (abbrev_distinct > key_distinct * bss->prop_card)
+	{
+		/*
+		 * Decay required cardinality aggressively after 10,000 tuples.
+		 */
+		if (memtupcount > 10000)
+			bss->prop_card *= 0.65;
+
+		return false;
+	}
+
+	/*
+	 * Abort abbreviation strategy.
+	 */
+	if (trace_sort)
+		elog(LOG, "bytea_abbrev: aborted abbreviation at %d "
+			 "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
+			 memtupcount, abbrev_distinct, key_distinct, bss->prop_card);
+
+	return true;
+}
+
 Datum
 bytea_sortsupport(PG_FUNCTION_ARGS)
 {
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
 
 	oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
 
-	/* Use generic string SortSupport, forcing "C" collation */
-	varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
+	ssup->comparator = byteafastcmp;
+
+	/*
+	 * Set up abbreviation support if requested.
+	 */
+	if (ssup->abbreviate)
+	{
+		ByteaSortSupport *bss;
+
+		bss = palloc_object(ByteaSortSupport);
+		bss->abbreviate = true;
+		bss->prop_card = 0.20;
+		initHyperLogLog(&bss->abbr_card, 10);
+		initHyperLogLog(&bss->full_card, 10);
+
+		ssup->ssup_extra = bss;
+		ssup->abbrev_full_comparator = ssup->comparator;
+		ssup->comparator = ssup_datum_unsigned_cmp;
+		ssup->abbrev_converter = bytea_abbrev_convert;
+		ssup->abbrev_abort = bytea_abbrev_abort;
+	}
 
 	MemoryContextSwitchTo(oldcontext);
 
diff --git a/src/backend/utils/adt/like_support.c b/src/backend/utils/adt/like_support.c
index dca1d9be035..b1b0192aa46 100644
--- a/src/backend/utils/adt/like_support.c
+++ b/src/backend/utils/adt/like_support.c
@@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen,
 static Selectivity regex_selectivity(const char *patt, int pattlen,
 									 bool case_insensitive,
 									 int fixed_prefix_len);
-static int	pattern_char_isalpha(char c, bool is_multibyte,
-								 pg_locale_t locale);
 static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
 								  Oid collation);
 static Datum string_to_datum(const char *str, Oid datatype);
@@ -986,8 +984,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
  */
 
 static Pattern_Prefix_Status
-like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
-				  Const **prefix_const, Selectivity *rest_selec)
+like_fixed_prefix(Const *patt_const, Const **prefix_const,
+				  Selectivity *rest_selec)
 {
 	char	   *match;
 	char	   *patt;
@@ -995,34 +993,10 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 	Oid			typeid = patt_const->consttype;
 	int			pos,
 				match_pos;
-	bool		is_multibyte = (pg_database_encoding_max_length() > 1);
-	pg_locale_t locale = 0;
 
 	/* the right-hand const is type text or bytea */
 	Assert(typeid == BYTEAOID || typeid == TEXTOID);
 
-	if (case_insensitive)
-	{
-		if (typeid == BYTEAOID)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("case insensitive matching not supported on type bytea")));
-
-		if (!OidIsValid(collation))
-		{
-			/*
-			 * This typically means that the parser could not resolve a
-			 * conflict of implicit collations, so report it that way.
-			 */
-			ereport(ERROR,
-					(errcode(ERRCODE_INDETERMINATE_COLLATION),
-					 errmsg("could not determine which collation to use for ILIKE"),
-					 errhint("Use the COLLATE clause to set the collation explicitly.")));
-		}
-
-		locale = pg_newlocale_from_collation(collation);
-	}
-
 	if (typeid != BYTEAOID)
 	{
 		patt = TextDatumGetCString(patt_const->constvalue);
@@ -1055,11 +1029,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				break;
 		}
 
-		/* Stop if case-varying character (it's sort of a wildcard) */
-		if (case_insensitive &&
-			pattern_char_isalpha(patt[pos], is_multibyte, locale))
-			break;
-
 		match[match_pos++] = patt[pos];
 	}
 
@@ -1071,8 +1040,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 		*prefix_const = string_to_bytea_const(match, match_pos);
 
 	if (rest_selec != NULL)
-		*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
-									   case_insensitive);
+		*rest_selec = like_selectivity(&patt[pos], pattlen - pos, false);
 
 	pfree(patt);
 	pfree(match);
@@ -1087,6 +1055,112 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 	return Pattern_Prefix_None;
 }
 
+/*
+ * Case-insensitive variant of like_fixed_prefix().  Multibyte and
+ * locale-aware for detecting cased characters.
+ *
+ * wpos indexes the pattern and wmatch_pos the extracted prefix; they differ
+ * once a backslash escape is skipped, so the rest starts at wpos.
+ */
+static Pattern_Prefix_Status
+like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const,
+					 Selectivity *rest_selec)
+{
+	text	   *val = DatumGetTextPP(patt_const->constvalue);
+	Oid			typeid = patt_const->consttype;
+	int			nbytes = VARSIZE_ANY_EXHDR(val);
+	int			wpos;
+	pg_wchar   *wpatt;
+	int			wpattlen;
+	pg_wchar   *wmatch;
+	int			wmatch_pos = 0;
+	char	   *match;
+	int			match_mblen;
+	pg_locale_t locale = 0;
+
+	/* the right-hand const is type text or bytea */
+	Assert(typeid == BYTEAOID || typeid == TEXTOID);
+
+	if (typeid == BYTEAOID)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("case insensitive matching not supported on type bytea")));
+
+	if (!OidIsValid(collation))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for ILIKE"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+
+	locale = pg_newlocale_from_collation(collation);
+
+	wpatt = palloc((nbytes + 1) * sizeof(pg_wchar));
+	wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes);
+
+	wmatch = palloc((nbytes + 1) * sizeof(pg_wchar));
+	for (wpos = 0; wpos < wpattlen; wpos++)
+	{
+		/* % and _ are wildcard characters in LIKE */
+		if (wpatt[wpos] == '%' ||
+			wpatt[wpos] == '_')
+			break;
+
+		/* Backslash escapes the next character */
+		if (wpatt[wpos] == '\\')
+		{
+			wpos++;
+			if (wpos >= wpattlen)
+				break;
+		}
+
+		/* For ILIKE, a case-varying character acts like a wildcard: stop */
+		if (pg_iswcased(wpatt[wpos], locale))
+			break;
+
+		wmatch[wmatch_pos++] = wpatt[wpos];
+	}
+
+	wmatch[wmatch_pos] = '\0';
+
+	match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1);
+	match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos);
+	match[match_mblen] = '\0';
+	pfree(wmatch);
+
+	*prefix_const = string_to_const(match, TEXTOID);
+	pfree(match);
+
+	if (rest_selec != NULL)
+	{
+		int			wrestlen = wpattlen - wpos;
+		char	   *rest;
+		int			rest_mblen;
+
+		rest = palloc(pg_database_encoding_max_length() * wrestlen + 1);
+		rest_mblen = pg_wchar2mb_with_len(&wpatt[wpos], rest, wrestlen);
+
+		*rest_selec = like_selectivity(rest, rest_mblen, true);
+		pfree(rest);
+	}
+
+	pfree(wpatt);
+
+	/* in LIKE, an empty pattern is an exact match! */
+	if (wpos == wpattlen)
+		return Pattern_Prefix_Exact;	/* reached end of pattern, so exact */
+
+	if (wmatch_pos > 0)
+		return Pattern_Prefix_Partial;
+
+	return Pattern_Prefix_None;
+}
+
 static Pattern_Prefix_Status
 regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
 				   Const **prefix_const, Selectivity *rest_selec)
@@ -1164,12 +1238,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
 	switch (ptype)
 	{
 		case Pattern_Type_Like:
-			result = like_fixed_prefix(patt, false, collation,
-									   prefix, rest_selec);
+			result = like_fixed_prefix(patt, prefix, rest_selec);
 			break;
 		case Pattern_Type_Like_IC:
-			result = like_fixed_prefix(patt, true, collation,
-									   prefix, rest_selec);
+			result = like_fixed_prefix_ci(patt, collation, prefix,
+										  rest_selec);
 			break;
 		case Pattern_Type_Regex:
 			result = regex_fixed_prefix(patt, false, collation,
@@ -1481,24 +1554,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
 	return sel;
 }
 
-/*
- * Check whether char is a letter (and, hence, subject to case-folding)
- *
- * In multibyte character sets or with ICU, we can't use isalpha, and it does
- * not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
- * Instead, just assume any non-ASCII char is potentially case-varying, and
- * hard-wire knowledge of which ASCII chars are letters.
- */
-static int
-pattern_char_isalpha(char c, bool is_multibyte,
-					 pg_locale_t locale)
-{
-	if (locale->ctype_is_c)
-		return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
-	else
-		return char_is_cased(c, locale);
-}
-
 
 /*
  * For bytea, the increment function need only increment the current byte
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 70933ee3843..8a3796aa5d0 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1626,21 +1626,6 @@ pg_towlower(pg_wchar wc, pg_locale_t locale)
 }
 
 /*
- * char_is_cased()
- *
- * Fuzzy test of whether the given char is case-varying or not. The argument
- * is a single byte, so in a multibyte encoding, just assume any non-ASCII
- * char is case-varying.
- */
-bool
-char_is_cased(char ch, pg_locale_t locale)
-{
-	if (locale->ctype == NULL)
-		return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-	return locale->ctype->char_is_cased(ch, locale);
-}
-
-/*
  * Return required encoding ID for the given locale, or -1 if any encoding is
  * valid for the locale.
  */
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0d4c754a267..0c2920112bb 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -191,13 +191,6 @@ wc_iscased_builtin(pg_wchar wc, pg_locale_t locale)
 	return pg_u_prop_cased(to_char32(wc));
 }
 
-static bool
-char_is_cased_builtin(char ch, pg_locale_t locale)
-{
-	return IS_HIGHBIT_SET(ch) ||
-		(ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-}
-
 static pg_wchar
 wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
 {
@@ -225,7 +218,6 @@ static const struct ctype_methods ctype_methods_builtin = {
 	.wc_ispunct = wc_ispunct_builtin,
 	.wc_isspace = wc_isspace_builtin,
 	.wc_isxdigit = wc_isxdigit_builtin,
-	.char_is_cased = char_is_cased_builtin,
 	.wc_iscased = wc_iscased_builtin,
 	.wc_tolower = wc_tolower_builtin,
 	.wc_toupper = wc_toupper_builtin,
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index e8820666b2d..18d026deda8 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -121,13 +121,6 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
 									 const char *locale,
 									 UErrorCode *pErrorCode);
 
-static bool
-char_is_cased_icu(char ch, pg_locale_t locale)
-{
-	return IS_HIGHBIT_SET(ch) ||
-		(ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
-}
-
 /*
  * XXX: many of the functions below rely on casts directly from pg_wchar to
  * UChar32, which is correct for the UTF-8 encoding, but not in general.
@@ -244,7 +237,6 @@ static const struct ctype_methods ctype_methods_icu = {
 	.wc_ispunct = wc_ispunct_icu,
 	.wc_isspace = wc_isspace_icu,
 	.wc_isxdigit = wc_isxdigit_icu,
-	.char_is_cased = char_is_cased_icu,
 	.wc_iscased = wc_iscased_icu,
 	.wc_toupper = toupper_icu,
 	.wc_tolower = tolower_icu,
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 3d841f818a5..3baa5816b5f 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -262,17 +262,6 @@ wc_iscased_libc_mb(pg_wchar wc, pg_locale_t locale)
 		iswlower_l((wint_t) wc, locale->lt);
 }
 
-static bool
-char_is_cased_libc(char ch, pg_locale_t locale)
-{
-	bool		is_multibyte = pg_database_encoding_max_length() > 1;
-
-	if (is_multibyte && IS_HIGHBIT_SET(ch))
-		return true;
-	else
-		return isalpha_l((unsigned char) ch, locale->lt);
-}
-
 static pg_wchar
 toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
 {
@@ -345,7 +334,6 @@ static const struct ctype_methods ctype_methods_libc_sb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
 	.wc_iscased = wc_iscased_libc_sb,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
@@ -371,7 +359,6 @@ static const struct ctype_methods ctype_methods_libc_other_mb = {
 	.wc_ispunct = wc_ispunct_libc_sb,
 	.wc_isspace = wc_isspace_libc_sb,
 	.wc_isxdigit = wc_isxdigit_libc_sb,
-	.char_is_cased = char_is_cased_libc,
 	.wc_iscased = wc_iscased_libc_sb,
 	.wc_toupper = toupper_libc_sb,
 	.wc_tolower = tolower_libc_sb,
@@ -393,7 +380,6 @@ static const struct ctype_methods ctype_methods_libc_utf8 = {
 	.wc_ispunct = wc_ispunct_libc_mb,
 	.wc_isspace = wc_isspace_libc_mb,
 	.wc_isxdigit = wc_isxdigit_libc_mb,
-	.char_is_cased = char_is_cased_libc,
 	.wc_iscased = wc_iscased_libc_mb,
 	.wc_toupper = toupper_libc_mb,
 	.wc_tolower = tolower_libc_mb,
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..8adeb8dadc6 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -92,7 +92,7 @@ typedef struct
 	int			last_returned;	/* Last comparison result (cache) */
 	bool		cache_blob;		/* Does buf2 contain strxfrm() blob, etc? */
 	bool		collate_c;
-	Oid			typid;			/* Actual datatype (text/bpchar/bytea/name) */
+	Oid			typid;			/* Actual datatype (text/bpchar/name) */
 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
 	hyperLogLogState full_card; /* Full key cardinality state */
 	double		prop_card;		/* Required cardinality proportion */
@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)
  * Includes locale support, and support for BpChar semantics (i.e. removing
  * trailing spaces before comparison).
  *
- * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
- * same representation.  Callers that always use the C collation (e.g.
- * non-collatable type callers like bytea) may have NUL bytes in their strings;
- * this will not work with any other collation, though.
+ * Relies on the assumption that text, VarChar, and BpChar all have the
+ * same representation.
  */
 void
 varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
  * representation.  Our encoding strategy is simple -- pack the first 8 bytes
  * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
  * stored in reverse order), and treat it as an unsigned integer.  When the "C"
- * locale is used, or in case of bytea, just memcpy() from original instead.
+ * locale is used, just memcpy() from original instead.
  */
 static Datum
 varstr_abbrev_convert(Datum original, SortSupport ssup)
@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 
 	/*
 	 * If we're using the C collation, use memcpy(), rather than strxfrm(), to
-	 * abbreviate keys.  The full comparator for the C locale is always
-	 * memcmp().  It would be incorrect to allow bytea callers (callers that
-	 * always force the C collation -- bytea isn't a collatable type, but this
-	 * approach is convenient) to use strxfrm().  This is because bytea
-	 * strings may contain NUL bytes.  Besides, this should be faster, too.
-	 *
-	 * More generally, it's okay that bytea callers can have NUL bytes in
-	 * strings because abbreviated cmp need not make a distinction between
-	 * terminating NUL bytes, and NUL bytes representing actual NULs in the
-	 * authoritative representation.  Hopefully a comparison at or past one
-	 * abbreviated key's terminating NUL byte will resolve the comparison
-	 * without consulting the authoritative representation; specifically, some
-	 * later non-NUL byte in the longer string can resolve the comparison
-	 * against a subsequent terminating NUL in the shorter string.  There will
-	 * usually be what is effectively a "length-wise" resolution there and
-	 * then.
-	 *
-	 * If that doesn't work out -- if all bytes in the longer string
-	 * positioned at or past the offset of the smaller string's (first)
-	 * terminating NUL are actually representative of NUL bytes in the
-	 * authoritative binary string (perhaps with some *terminating* NUL bytes
-	 * towards the end of the longer string iff it happens to still be small)
-	 * -- then an authoritative tie-breaker will happen, and do the right
-	 * thing: explicitly consider string length.
+	 * abbreviate keys.  The full comparator for the C locale is also
+	 * memcmp().  This should be faster than strxfrm().
 	 */
 	if (sss->collate_c)
 		memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
 		 * strxfrm() blob is itself NUL terminated, leaving no danger of
 		 * misinterpreting any NUL bytes not intended to be interpreted as
 		 * logically representing termination.
-		 *
-		 * (Actually, even if there were NUL bytes in the blob it would be
-		 * okay.  See remarks on bytea case above.)
 		 */
 		memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
 	}
@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)
 	 * NULLs are generally disregarded, if only NULL values were seen so far,
 	 * that might misrepresent costs if we failed to clamp.
 	 */
-	if (abbrev_distinct <= 1.0)
+	if (abbrev_distinct < 1.0)
 		abbrev_distinct = 1.0;
 
-	if (key_distinct <= 1.0)
+	if (key_distinct < 1.0)
 		key_distinct = 1.0;
 
 	/*