🌐 AI搜索 & 代理 主页
Skip to content

Commit 9c8de15

Browse files
committed
Use multibyte-aware extraction of pattern prefixes.
Previously, like_fixed_prefix() used char-at-a-time logic, which forced it to be too conservative for case-insensitive matching.

Introduce like_fixed_prefix_ci(), and use that for case-insensitive pattern prefixes. It uses multibyte and locale-aware logic, along with the new pg_iswcased() API introduced in 630706c.

Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
1 parent 8191937 commit 9c8de15

File tree

1 file changed

+112
-57
lines changed

1 file changed

+112
-57
lines changed

src/backend/utils/adt/like_support.c

Lines changed: 112 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen,
9999
static Selectivity regex_selectivity(const char *patt, int pattlen,
100100
bool case_insensitive,
101101
int fixed_prefix_len);
102-
static int pattern_char_isalpha(char c, bool is_multibyte,
103-
pg_locale_t locale);
104102
static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
105103
Oid collation);
106104
static Datum string_to_datum(const char *str, Oid datatype);
@@ -986,43 +984,19 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
986984
*/
987985

988986
static Pattern_Prefix_Status
989-
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
990-
Const **prefix_const, Selectivity *rest_selec)
987+
like_fixed_prefix(Const *patt_const, Const **prefix_const,
988+
Selectivity *rest_selec)
991989
{
992990
char *match;
993991
char *patt;
994992
int pattlen;
995993
Oid typeid = patt_const->consttype;
996994
int pos,
997995
match_pos;
998-
bool is_multibyte = (pg_database_encoding_max_length() > 1);
999-
pg_locale_t locale = 0;
1000996

1001997
/* the right-hand const is type text or bytea */
1002998
Assert(typeid == BYTEAOID || typeid == TEXTOID);
1003999

1004-
if (case_insensitive)
1005-
{
1006-
if (typeid == BYTEAOID)
1007-
ereport(ERROR,
1008-
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1009-
errmsg("case insensitive matching not supported on type bytea")));
1010-
1011-
if (!OidIsValid(collation))
1012-
{
1013-
/*
1014-
* This typically means that the parser could not resolve a
1015-
* conflict of implicit collations, so report it that way.
1016-
*/
1017-
ereport(ERROR,
1018-
(errcode(ERRCODE_INDETERMINATE_COLLATION),
1019-
errmsg("could not determine which collation to use for ILIKE"),
1020-
errhint("Use the COLLATE clause to set the collation explicitly.")));
1021-
}
1022-
1023-
locale = pg_newlocale_from_collation(collation);
1024-
}
1025-
10261000
if (typeid != BYTEAOID)
10271001
{
10281002
patt = TextDatumGetCString(patt_const->constvalue);
@@ -1055,11 +1029,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
10551029
break;
10561030
}
10571031

1058-
/* Stop if case-varying character (it's sort of a wildcard) */
1059-
if (case_insensitive &&
1060-
pattern_char_isalpha(patt[pos], is_multibyte, locale))
1061-
break;
1062-
10631032
match[match_pos++] = patt[pos];
10641033
}
10651034

@@ -1071,8 +1040,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
10711040
*prefix_const = string_to_bytea_const(match, match_pos);
10721041

10731042
if (rest_selec != NULL)
1074-
*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1075-
case_insensitive);
1043+
*rest_selec = like_selectivity(&patt[pos], pattlen - pos, false);
10761044

10771045
pfree(patt);
10781046
pfree(match);
@@ -1087,6 +1055,112 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
10871055
return Pattern_Prefix_None;
10881056
}
10891057

1058+
/*
1059+
* Case-insensitive variant of like_fixed_prefix(). Multibyte and
1060+
* locale-aware for detecting cased characters.
1061+
*/
1062+
static Pattern_Prefix_Status
1063+
like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const,
1064+
Selectivity *rest_selec)
1065+
{
1066+
text *val = DatumGetTextPP(patt_const->constvalue);
1067+
Oid typeid = patt_const->consttype;
1068+
int nbytes = VARSIZE_ANY_EXHDR(val);
1069+
int wpos;
1070+
pg_wchar *wpatt;
1071+
int wpattlen;
1072+
pg_wchar *wmatch;
1073+
int wmatch_pos = 0;
1074+
char *match;
1075+
int match_mblen;
1076+
pg_locale_t locale = 0;
1077+
1078+
/* the right-hand const is type text or bytea */
1079+
Assert(typeid == BYTEAOID || typeid == TEXTOID);
1080+
1081+
if (typeid == BYTEAOID)
1082+
ereport(ERROR,
1083+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1084+
errmsg("case insensitive matching not supported on type bytea")));
1085+
1086+
if (!OidIsValid(collation))
1087+
{
1088+
/*
1089+
* This typically means that the parser could not resolve a conflict
1090+
* of implicit collations, so report it that way.
1091+
*/
1092+
ereport(ERROR,
1093+
(errcode(ERRCODE_INDETERMINATE_COLLATION),
1094+
errmsg("could not determine which collation to use for ILIKE"),
1095+
errhint("Use the COLLATE clause to set the collation explicitly.")));
1096+
}
1097+
1098+
locale = pg_newlocale_from_collation(collation);
1099+
1100+
wpatt = palloc((nbytes + 1) * sizeof(pg_wchar));
1101+
wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes);
1102+
1103+
wmatch = palloc((nbytes + 1) * sizeof(pg_wchar));
1104+
for (wpos = 0; wpos < wpattlen; wpos++)
1105+
{
1106+
/* % and _ are wildcard characters in LIKE */
1107+
if (wpatt[wpos] == '%' ||
1108+
wpatt[wpos] == '_')
1109+
break;
1110+
1111+
/* Backslash escapes the next character */
1112+
if (wpatt[wpos] == '\\')
1113+
{
1114+
wpos++;
1115+
if (wpos >= wpattlen)
1116+
break;
1117+
}
1118+
1119+
/*
1120+
* For ILIKE, stop if it's a case-varying character (it's sort of a
1121+
* wildcard).
1122+
*/
1123+
if (pg_iswcased(wpatt[wpos], locale))
1124+
break;
1125+
1126+
wmatch[wmatch_pos++] = wpatt[wpos];
1127+
}
1128+
1129+
wmatch[wmatch_pos] = '\0';
1130+
1131+
match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1);
1132+
match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos);
1133+
match[match_mblen] = '\0';
1134+
pfree(wmatch);
1135+
1136+
*prefix_const = string_to_const(match, TEXTOID);
1137+
pfree(match);
1138+
1139+
if (rest_selec != NULL)
1140+
{
1141+
int wrestlen = wpattlen - wmatch_pos;
1142+
char *rest;
1143+
int rest_mblen;
1144+
1145+
rest = palloc(pg_database_encoding_max_length() * wrestlen + 1);
1146+
rest_mblen = pg_wchar2mb_with_len(&wpatt[wmatch_pos], rest, wrestlen);
1147+
1148+
*rest_selec = like_selectivity(rest, rest_mblen, true);
1149+
pfree(rest);
1150+
}
1151+
1152+
pfree(wpatt);
1153+
1154+
/* in LIKE, an empty pattern is an exact match! */
1155+
if (wpos == wpattlen)
1156+
return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
1157+
1158+
if (wmatch_pos > 0)
1159+
return Pattern_Prefix_Partial;
1160+
1161+
return Pattern_Prefix_None;
1162+
}
1163+
10901164
static Pattern_Prefix_Status
10911165
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
10921166
Const **prefix_const, Selectivity *rest_selec)
@@ -1164,12 +1238,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
11641238
switch (ptype)
11651239
{
11661240
case Pattern_Type_Like:
1167-
result = like_fixed_prefix(patt, false, collation,
1168-
prefix, rest_selec);
1241+
result = like_fixed_prefix(patt, prefix, rest_selec);
11691242
break;
11701243
case Pattern_Type_Like_IC:
1171-
result = like_fixed_prefix(patt, true, collation,
1172-
prefix, rest_selec);
1244+
result = like_fixed_prefix_ci(patt, collation, prefix,
1245+
rest_selec);
11731246
break;
11741247
case Pattern_Type_Regex:
11751248
result = regex_fixed_prefix(patt, false, collation,
@@ -1481,24 +1554,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
14811554
return sel;
14821555
}
14831556

1484-
/*
1485-
* Check whether char is a letter (and, hence, subject to case-folding)
1486-
*
1487-
* In multibyte character sets or with ICU, we can't use isalpha, and it does
1488-
* not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1489-
* Instead, just assume any non-ASCII char is potentially case-varying, and
1490-
* hard-wire knowledge of which ASCII chars are letters.
1491-
*/
1492-
static int
1493-
pattern_char_isalpha(char c, bool is_multibyte,
1494-
pg_locale_t locale)
1495-
{
1496-
if (locale->ctype_is_c)
1497-
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
1498-
else
1499-
return char_is_cased(c, locale);
1500-
}
1501-
15021557

15031558
/*
15041559
* For bytea, the increment function need only increment the current byte

0 commit comments

Comments
 (0)