🌐 AI搜索 & 代理 主页
Skip to content

Commit 18b3493

Browse files
committed
Fix text substring search for non-deterministic collations.
Due to an off-by-one error, the code failed to find matches at the end of the haystack. Fix by rewriting the loop.

While at it, fix a comment that claimed that the function could find a zero-length match. Such a match could send a caller into an endless loop. However, zero-length matches only make sense with an empty search string, and that case is explicitly excluded by all callers. To make sure it stays that way, add an Assert and a comment.

Bug: #19341
Reported-by: Adam Warland <adam.warland@infor.com>
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/19341-1d9a22915edfec58@postgresql.org
Backpatch-through: 18
1 parent 02ba5e3 commit 18b3493

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

src/backend/utils/adt/varlena.c

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1424,6 +1424,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14241424
const char *hptr;
14251425

14261426
Assert(start_ptr >= haystack && start_ptr <= haystack_end);
1427+
Assert(needle_len > 0);
14271428

14281429
state->last_match_len_tmp = needle_len;
14291430

@@ -1436,19 +1437,26 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14361437
* needle under the given collation.
14371438
*
14381439
* Note, the found substring could have a different length than the
1439-
* needle, including being empty. Callers that want to skip over the
1440-
* found string need to read the length of the found substring from
1441-
* last_match_len rather than just using the length of their needle.
1440+
* needle. Callers that want to skip over the found string need to
1441+
* read the length of the found substring from last_match_len rather
1442+
* than just using the length of their needle.
14421443
*
14431444
* Most callers will require "greedy" semantics, meaning that we need
14441445
* to find the longest such substring, not the shortest. For callers
14451446
* that don't need greedy semantics, we can finish on the first match.
1447+
*
1448+
* This loop depends on the assumption that the needle is nonempty and
1449+
* any matching substring must also be nonempty. (Even if the
1450+
* collation would accept an empty match, returning one would send
1451+
* callers that search for successive matches into an infinite loop.)
14461452
*/
14471453
const char *result_hptr = NULL;
14481454

14491455
hptr = start_ptr;
14501456
while (hptr < haystack_end)
14511457
{
1458+
const char *test_end;
1459+
14521460
/*
14531461
* First check the common case that there is a match in the
14541462
* haystack of exactly the length of the needle.
@@ -1459,19 +1467,22 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
14591467
return (char *) hptr;
14601468

14611469
/*
1462-
* Else check if any of the possible substrings starting at hptr
1463-
* are equal to the needle.
1470+
* Else check if any of the non-empty substrings starting at hptr
1471+
* compare equal to the needle.
14641472
*/
1465-
for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
1473+
test_end = hptr;
1474+
do
14661475
{
1476+
test_end += pg_mblen(test_end);
14671477
if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
14681478
{
14691479
state->last_match_len_tmp = (test_end - hptr);
14701480
result_hptr = hptr;
14711481
if (!state->greedy)
14721482
break;
14731483
}
1474-
}
1484+
} while (test_end < haystack_end);
1485+
14751486
if (result_hptr)
14761487
break;
14771488

src/test/regress/expected/collate.icu.utf8.out

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1484,6 +1484,13 @@ SELECT array_sort('{a,B}'::text[] COLLATE "C");
14841484
{B,a}
14851485
(1 row)
14861486

1487+
-- test replace() at the end of the string (bug #19341)
1488+
SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
1489+
replace
1490+
---------
1491+
tester
1492+
(1 row)
1493+
14871494
-- test language tags
14881495
CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
14891496
SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;

src/test/regress/sql/collate.icu.utf8.sql

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -568,6 +568,9 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
568568
SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
569569
SELECT array_sort('{a,B}'::text[] COLLATE "C");
570570

571+
-- test replace() at the end of the string (bug #19341)
572+
SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
573+
571574
-- test language tags
572575
CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
573576
SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;

0 commit comments

Comments
 (0)