🌐 AI搜索 & 代理 主页
Skip to content

Commit 8023086

Browse files
committed
Amend recent fix for SIMILAR TO regex conversion.
Commit e3ffc3e fixed the translation of character classes in SIMILAR TO regular expressions. Unfortunately the fix broke a corner case: if there is an escape character right after the opening bracket (for example in "[\q]"), a closing bracket right after the escape sequence would not be seen as closing the character class.

There were two more oversights: a backslash or a nested opening bracket right at the beginning of a character class should remove the special meaning from any following caret or closing bracket.

This bug suggests that this code needs to be more readable, so also rename the variables "charclass_depth" and "charclass_start" to something more meaningful, rewrite an "if" cascade to be more consistent, and improve the commentary.

Reported-by: Dominique Devienne <ddevienne@gmail.com>
Reported-by: Stephan Springl <springl-psql@bfw-online.de>
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAFCRh-8NwJd0jq6P=R3qhHyqU7hw0BTor3W0SvUcii24et+zAw@mail.gmail.com
Backpatch-through: 13
1 parent 01bea60 commit 8023086

File tree

3 files changed

+82
-27
lines changed

3 files changed

+82
-27
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 70 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -774,10 +774,8 @@ similar_escape_internal(text *pat_text, text *esc_text)
774774
elen;
775775
bool afterescape = false;
776776
int nquotes = 0;
777-
int charclass_depth = 0; /* Nesting level of character classes,
778-
* encompassed by square brackets */
779-
int charclass_start = 0; /* State of the character class start,
780-
* for carets */
777+
int bracket_depth = 0; /* square bracket nesting level */
778+
int charclass_pos = 0; /* position inside a character class */
781779

782780
p = VARDATA_ANY(pat_text);
783781
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -836,6 +834,17 @@ similar_escape_internal(text *pat_text, text *esc_text)
836834
* the relevant part separators in the above expansion. If the result
837835
* of this function is used in a plain regexp match (SIMILAR TO), the
838836
* escape-double-quotes have no effect on the match behavior.
837+
*
838+
* While we don't fully validate character classes (bracket expressions),
839+
* we do need to parse them well enough to know where they end.
840+
* "charclass_pos" tracks where we are in a character class.
841+
* Its value is uninteresting when bracket_depth is 0.
842+
* But when bracket_depth > 0, it will be
843+
* 1: right after the opening '[' (a following '^' will negate
844+
* the class, while ']' is a literal character)
845+
* 2: right after a '^' after the opening '[' (']' is still a literal
846+
* character)
847+
* 3 or more: further inside the character class (']' ends the class)
839848
*----------
840849
*/
841850

@@ -907,7 +916,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
907916
/* fast path */
908917
if (afterescape)
909918
{
910-
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
919+
if (pchar == '"' && bracket_depth < 1) /* escape-double-quote? */
911920
{
912921
/* emit appropriate part separator, per notes above */
913922
if (nquotes == 0)
@@ -948,6 +957,12 @@ similar_escape_internal(text *pat_text, text *esc_text)
948957
*/
949958
*r++ = '\\';
950959
*r++ = pchar;
960+
961+
/*
962+
* If we encounter an escaped character in a character class,
963+
* we are no longer at the beginning.
964+
*/
965+
charclass_pos = 3;
951966
}
952967
afterescape = false;
953968
}
@@ -956,41 +971,69 @@ similar_escape_internal(text *pat_text, text *esc_text)
956971
/* SQL escape character; do not send to output */
957972
afterescape = true;
958973
}
959-
else if (charclass_depth > 0)
974+
else if (bracket_depth > 0)
960975
{
976+
/* inside a character class */
961977
if (pchar == '\\')
978+
{
979+
/*
980+
* If we're here, backslash is not the SQL escape character,
981+
* so treat it as a literal class element, which requires
982+
* doubling it. (This matches our behavior for backslashes
983+
* outside character classes.)
984+
*/
962985
*r++ = '\\';
986+
}
963987
*r++ = pchar;
964988

965-
/*
966-
* Ignore a closing bracket at the start of a character class.
967-
* Such a bracket is taken literally rather than closing the
968-
* class. "charclass_start" is 1 right at the beginning of a
969-
* class and 2 after an initial caret.
970-
*/
971-
if (pchar == ']' && charclass_start > 2)
972-
charclass_depth--;
989+
/* parse the character class well enough to identify ending ']' */
990+
if (pchar == ']' && charclass_pos > 2)
991+
{
992+
/* found the real end of a bracket pair */
993+
bracket_depth--;
994+
/* don't reset charclass_pos, this may be an inner bracket */
995+
}
973996
else if (pchar == '[')
974-
charclass_depth++;
997+
{
998+
/* start of a nested bracket pair */
999+
bracket_depth++;
9751000

976-
/*
977-
* If there is a caret right after the opening bracket, it negates
978-
* the character class, but a following closing bracket should
979-
* still be treated as a normal character. That holds only for
980-
* the first caret, so only the values 1 and 2 mean that closing
981-
* brackets should be taken literally.
982-
*/
983-
if (pchar == '^')
984-
charclass_start++;
1001+
/*
1002+
* We are no longer at the beginning of a character class.
1003+
* (The nested bracket pair is a collating element, not a
1004+
* character class in its own right.)
1005+
*/
1006+
charclass_pos = 3;
1007+
}
1008+
else if (pchar == '^')
1009+
{
1010+
/*
1011+
* A caret right after the opening bracket negates the
1012+
* character class. In that case, the following will
1013+
* increment charclass_pos from 1 to 2, so that a following
1014+
* ']' is still a literal character and does not end the
1015+
* character class. If we are further inside a character
1016+
* class, charclass_pos might get incremented past 3, which is
1017+
* fine.
1018+
*/
1019+
charclass_pos++;
1020+
}
9851021
else
986-
charclass_start = 3; /* definitely past the start */
1022+
{
1023+
/*
1024+
* Anything else (including a backslash or leading ']') is an
1025+
* element of the character class, so we are no longer at the
1026+
* beginning of the class.
1027+
*/
1028+
charclass_pos = 3;
1029+
}
9871030
}
9881031
else if (pchar == '[')
9891032
{
9901033
/* start of a character class */
9911034
*r++ = pchar;
992-
charclass_depth++;
993-
charclass_start = 1;
1035+
bracket_depth = 1;
1036+
charclass_pos = 1;
9941037
}
9951038
else if (pchar == '%')
9961039
{

src/test/regress/expected/strings.out

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
681681
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
682682
(2 rows)
683683

684+
-- Closing square bracket after an escape sequence at the beginning of
685+
-- a character class closes the character class
686+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
687+
QUERY PLAN
688+
---------------------------------------
689+
Seq Scan on text_tbl
690+
Filter: (f1 ~ '^(?:[\a].*)$'::text)
691+
(2 rows)
692+
684693
-- Test backslash escapes in regexp_replace's replacement string
685694
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
686695
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
216216
-- Closing square bracket effective after two carets at the beginning
217217
-- of character class.
218218
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
219+
-- Closing square bracket after an escape sequence at the beginning of
220+
-- a character class closes the character class
221+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
219222

220223
-- Test backslash escapes in regexp_replace's replacement string
221224
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');

0 commit comments

Comments
 (0)