Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 18bac60

Browse files
committed
Let regexp_replace() make use of REG_NOSUB when feasible.
If the replacement string doesn't contain \1...\9, then we don't need sub-match locations, so we can use the REG_NOSUB optimization here too. There's already a pre-scan of the replacement string to look for backslashes, so extend that to check for digits, and refactor to allow that to happen before we compile the regexp.

While at it, try to speed up the pre-scan by using memchr() instead of a handwritten loop. It's likely that this is lost in the noise compared to the regexp processing proper, but maybe not. In any case, this coding is shorter.

Also, add some test cases to improve the poor coverage of appendStringInfoRegexpSubstr().

Discussion: https://postgr.es/m/[email protected]
1 parent e126945 commit 18bac60

File tree

5 files changed

+90
-65
lines changed

5 files changed

+90
-65
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -630,11 +630,10 @@ textregexreplace_noopt(PG_FUNCTION_ARGS)
630630
text *s = PG_GETARG_TEXT_PP(0);
631631
text *p = PG_GETARG_TEXT_PP(1);
632632
text *r = PG_GETARG_TEXT_PP(2);
633-
regex_t *re;
634-
635-
re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
636633

637-
PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0, 1));
634+
PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
635+
REG_ADVANCED, PG_GET_COLLATION(),
636+
0, 1));
638637
}
639638

640639
/*
@@ -648,7 +647,6 @@ textregexreplace(PG_FUNCTION_ARGS)
648647
text *p = PG_GETARG_TEXT_PP(1);
649648
text *r = PG_GETARG_TEXT_PP(2);
650649
text *opt = PG_GETARG_TEXT_PP(3);
651-
regex_t *re;
652650
pg_re_flags flags;
653651

654652
/*
@@ -672,10 +670,9 @@ textregexreplace(PG_FUNCTION_ARGS)
672670

673671
parse_re_flags(&flags, opt);
674672

675-
re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
676-
677-
PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, 0,
678-
flags.glob ? 0 : 1));
673+
PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
674+
flags.cflags, PG_GET_COLLATION(),
675+
0, flags.glob ? 0 : 1));
679676
}
680677

681678
/*
@@ -694,7 +691,6 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
694691
int n = 1;
695692
text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5);
696693
pg_re_flags re_flags;
697-
regex_t *re;
698694

699695
/* Collect optional parameters */
700696
if (PG_NARGS() > 3)
@@ -723,11 +719,10 @@ textregexreplace_extended(PG_FUNCTION_ARGS)
723719
if (PG_NARGS() <= 4)
724720
n = re_flags.glob ? 0 : 1;
725721

726-
/* Compile the regular expression */
727-
re = RE_compile_and_cache(p, re_flags.cflags, PG_GET_COLLATION());
728-
729722
/* Do the replacement(s) */
730-
PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, start - 1, n));
723+
PG_RETURN_TEXT_P(replace_text_regexp(s, p, r,
724+
re_flags.cflags, PG_GET_COLLATION(),
725+
start - 1, n));
731726
}
732727

733728
/* This is separate to keep the opr_sanity regression test from complaining */

src/backend/utils/adt/varlena.c

Lines changed: 53 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -4359,34 +4359,36 @@ replace_text(PG_FUNCTION_ARGS)
43594359
}
43604360

43614361
/*
4362-
* check_replace_text_has_escape_char
4362+
* check_replace_text_has_escape
43634363
*
4364-
* check whether replace_text contains escape char.
4364+
* Returns 0 if text contains no backslashes that need processing.
4365+
* Returns 1 if text contains backslashes, but not regexp submatch specifiers.
4366+
* Returns 2 if text contains regexp submatch specifiers (\1 .. \9).
43654367
*/
4366-
static bool
4367-
check_replace_text_has_escape_char(const text *replace_text)
4368+
static int
4369+
check_replace_text_has_escape(const text *replace_text)
43684370
{
4371+
int result = 0;
43694372
const char *p = VARDATA_ANY(replace_text);
43704373
const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
43714374

4372-
if (pg_database_encoding_max_length() == 1)
4373-
{
4374-
for (; p < p_end; p++)
4375-
{
4376-
if (*p == '\\')
4377-
return true;
4378-
}
4379-
}
4380-
else
4375+
while (p < p_end)
43814376
{
4382-
for (; p < p_end; p += pg_mblen(p))
4377+
/* Find next escape char, if any. */
4378+
p = memchr(p, '\\', p_end - p);
4379+
if (p == NULL)
4380+
break;
4381+
p++;
4382+
/* Note: a backslash at the end doesn't require extra processing. */
4383+
if (p < p_end)
43834384
{
4384-
if (*p == '\\')
4385-
return true;
4385+
if (*p >= '1' && *p <= '9')
4386+
return 2; /* Found a submatch specifier, so done */
4387+
result = 1; /* Found some other sequence, keep looking */
4388+
p++;
43864389
}
43874390
}
4388-
4389-
return false;
4391+
return result;
43904392
}
43914393

43924394
/*
@@ -4403,25 +4405,17 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
44034405
{
44044406
const char *p = VARDATA_ANY(replace_text);
44054407
const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
4406-
int eml = pg_database_encoding_max_length();
44074408

4408-
for (;;)
4409+
while (p < p_end)
44094410
{
44104411
const char *chunk_start = p;
44114412
int so;
44124413
int eo;
44134414

4414-
/* Find next escape char. */
4415-
if (eml == 1)
4416-
{
4417-
for (; p < p_end && *p != '\\'; p++)
4418-
/* nothing */ ;
4419-
}
4420-
else
4421-
{
4422-
for (; p < p_end && *p != '\\'; p += pg_mblen(p))
4423-
/* nothing */ ;
4424-
}
4415+
/* Find next escape char, if any. */
4416+
p = memchr(p, '\\', p_end - p);
4417+
if (p == NULL)
4418+
p = p_end;
44254419

44264420
/* Copy the text we just scanned over, if any. */
44274421
if (p > chunk_start)
@@ -4473,7 +4467,7 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
44734467
continue;
44744468
}
44754469

4476-
if (so != -1 && eo != -1)
4470+
if (so >= 0 && eo >= 0)
44774471
{
44784472
/*
44794473
* Copy the text that is back reference of regexp. Note so and eo
@@ -4491,45 +4485,57 @@ appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
44914485
}
44924486
}
44934487

4494-
#define REGEXP_REPLACE_BACKREF_CNT 10
4495-
44964488
/*
44974489
* replace_text_regexp
44984490
*
4499-
* replace substring(s) in src_text that match regexp with replace_text.
4491+
* replace substring(s) in src_text that match pattern with replace_text.
4492+
* The replace_text can contain backslash markers to substitute
4493+
* (parts of) the matched text.
45004494
*
4495+
* cflags: regexp compile flags.
4496+
* collation: collation to use.
45014497
* search_start: the character (not byte) offset in src_text at which to
45024498
* begin searching.
45034499
* n: if 0, replace all matches; if > 0, replace only the N'th match.
4504-
*
4505-
* Note: to avoid having to include regex.h in builtins.h, we declare
4506-
* the regexp argument as void *, but really it's regex_t *.
45074500
*/
45084501
text *
4509-
replace_text_regexp(text *src_text, void *regexp,
4502+
replace_text_regexp(text *src_text, text *pattern_text,
45104503
text *replace_text,
4504+
int cflags, Oid collation,
45114505
int search_start, int n)
45124506
{
45134507
text *ret_text;
4514-
regex_t *re = (regex_t *) regexp;
4508+
regex_t *re;
45154509
int src_text_len = VARSIZE_ANY_EXHDR(src_text);
45164510
int nmatches = 0;
45174511
StringInfoData buf;
4518-
regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
4512+
regmatch_t pmatch[10]; /* main match, plus \1 to \9 */
4513+
int nmatch = lengthof(pmatch);
45194514
pg_wchar *data;
45204515
size_t data_len;
45214516
int data_pos;
45224517
char *start_ptr;
4523-
bool have_escape;
4518+
int escape_status;
45244519

45254520
initStringInfo(&buf);
45264521

45274522
/* Convert data string to wide characters. */
45284523
data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
45294524
data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
45304525

4531-
/* Check whether replace_text has escape char. */
4532-
have_escape = check_replace_text_has_escape_char(replace_text);
4526+
/* Check whether replace_text has escapes, especially regexp submatches. */
4527+
escape_status = check_replace_text_has_escape(replace_text);
4528+
4529+
/* If no regexp submatches, we can use REG_NOSUB. */
4530+
if (escape_status < 2)
4531+
{
4532+
cflags |= REG_NOSUB;
4533+
/* Also tell pg_regexec we only want the whole-match location. */
4534+
nmatch = 1;
4535+
}
4536+
4537+
/* Prepare the regexp. */
4538+
re = RE_compile_and_cache(pattern_text, cflags, collation);
45334539

45344540
/* start_ptr points to the data_pos'th character of src_text */
45354541
start_ptr = (char *) VARDATA_ANY(src_text);
@@ -4546,7 +4552,7 @@ replace_text_regexp(text *src_text, void *regexp,
45464552
data_len,
45474553
search_start,
45484554
NULL, /* no details */
4549-
REGEXP_REPLACE_BACKREF_CNT,
4555+
nmatch,
45504556
pmatch,
45514557
0);
45524558

@@ -4602,10 +4608,9 @@ replace_text_regexp(text *src_text, void *regexp,
46024608
}
46034609

46044610
/*
4605-
* Copy the replace_text. Process back references when the
4606-
* replace_text has escape characters.
4611+
* Copy the replace_text, processing escapes if any are present.
46074612
*/
4608-
if (have_escape)
4613+
if (escape_status > 0)
46094614
appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
46104615
start_ptr, data_pos);
46114616
else

src/include/utils/varlena.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ extern bool SplitDirectoriesString(char *rawstring, char separator,
3333
List **namelist);
3434
extern bool SplitGUCList(char *rawstring, char separator,
3535
List **namelist);
36-
extern text *replace_text_regexp(text *src_text, void *regexp,
36+
extern text *replace_text_regexp(text *src_text, text *pattern_text,
3737
text *replace_text,
38+
int cflags, Oid collation,
3839
int search_start, int n);
3940

4041
#endif

src/test/regress/expected/strings.out

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,13 +571,32 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
571571
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
572572
ERROR: invalid escape string
573573
HINT: Escape string must be empty or one character.
574-
-- Test back reference in regexp_replace
574+
-- Test backslash escapes in regexp_replace's replacement string
575575
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
576576
regexp_replace
577577
----------------
578578
(111) 222-3333
579579
(1 row)
580580

581+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
582+
regexp_replace
583+
-------------------
584+
fXooYbaXrrYbaXzzY
585+
(1 row)
586+
587+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
588+
regexp_replace
589+
----------------
590+
fX\YbaX\YbaX\Y
591+
(1 row)
592+
593+
-- not an error, though perhaps it should be:
594+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
595+
regexp_replace
596+
-----------------
597+
fX\YoZ\barrbazz
598+
(1 row)
599+
581600
SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');
582601
regexp_replace
583602
----------------

src/test/regress/sql/strings.sql

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,13 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
187187
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
188188
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
189189

190-
-- Test back reference in regexp_replace
190+
-- Test backslash escapes in regexp_replace's replacement string
191191
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
192+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');
193+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\\\Y', 'g');
194+
-- not an error, though perhaps it should be:
195+
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\Y\\1Z\\');
196+
192197
SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g');
193198
SELECT regexp_replace('AAA', '^|$', 'Z', 'g');
194199
SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi');

0 commit comments

Comments
 (0)