Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 0e6aa87

Browse files
committed
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of backref references.) Users could have obtained the same results before this by being careful to write "non capturing" parentheses wherever possible, but few people bother with that. Discussion: https://postgr.es/m/[email protected]
1 parent 76ad244 commit 0e6aa87

File tree

10 files changed

+154
-45
lines changed

10 files changed

+154
-45
lines changed

contrib/pg_trgm/trgm_regexp.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation,
541541
* Stage 1: Compile the regexp into a NFA, using the regexp library.
542542
*/
543543
#ifdef IGNORECASE
544-
RE_compile(&regex, text_re, REG_ADVANCED | REG_ICASE, collation);
544+
RE_compile(&regex, text_re,
545+
REG_ADVANCED | REG_NOSUB | REG_ICASE, collation);
545546
#else
546-
RE_compile(&regex, text_re, REG_ADVANCED, collation);
547+
RE_compile(&regex, text_re,
548+
REG_ADVANCED | REG_NOSUB, collation);
547549
#endif
548550

549551
/*

src/backend/regex/regc_lex.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -528,10 +528,7 @@ next(struct vars *v)
528528
}
529529
assert(NOTREACHED);
530530
}
531-
if (v->cflags & REG_NOSUB)
532-
RETV('(', 0); /* all parens non-capturing */
533-
else
534-
RETV('(', 1);
531+
RETV('(', 1);
535532
break;
536533
case CHR(')'):
537534
if (LASTTYPE('('))

src/backend/regex/regcomp.c

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
6565
static void freesubre(struct vars *, struct subre *);
6666
static void freesubreandsiblings(struct vars *, struct subre *);
6767
static void freesrnode(struct vars *, struct subre *);
68-
static void optst(struct vars *, struct subre *);
68+
static void removecaptures(struct vars *, struct subre *);
6969
static int numst(struct subre *, int);
7070
static void markst(struct subre *);
7171
static void cleanst(struct vars *);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
431431
dumpst(v->tree, debug, 1);
432432
}
433433
#endif
434-
optst(v, v->tree);
434+
if (v->cflags & REG_NOSUB)
435+
removecaptures(v, v->tree);
435436
v->ntree = numst(v->tree, 1);
436437
markst(v->tree);
437438
cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
10131014
break;
10141015
case BACKREF: /* the Feature From The Black Lagoon */
10151016
INSIST(type != LACON, REG_ESUBREG);
1016-
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
1017-
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
1017+
subno = v->nextvalue;
1018+
assert(subno > 0);
1019+
INSIST(subno < v->nsubs, REG_ESUBREG);
1020+
NOERRN();
1021+
INSIST(v->subs[subno] != NULL, REG_ESUBREG);
10181022
NOERRN();
1019-
assert(v->nextvalue > 0);
10201023
atom = subre(v, 'b', BACKR, lp, rp);
10211024
NOERRN();
1022-
subno = v->nextvalue;
10231025
atom->backno = subno;
1026+
v->subs[subno]->flags |= BRUSE;
10241027
EMPTYARC(lp, rp); /* temporarily, so there's something */
10251028
NEXT();
10261029
break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v, /* might be NULL */
21412144
}
21422145

21432146
/*
2144-
* optst - optimize a subRE subtree
2147+
* removecaptures - remove unnecessary capture subREs
2148+
*
2149+
* If the caller said that it doesn't care about subexpression match data,
2150+
* we may delete the "capture" markers on subREs that are not referenced
2151+
* by any backrefs, and then simplify anything that's become non-messy.
2152+
* Call this only if REG_NOSUB flag is set.
21452153
*/
21462154
static void
2147-
optst(struct vars *v,
2148-
struct subre *t)
2155+
removecaptures(struct vars *v,
2156+
struct subre *t)
21492157
{
2158+
struct subre *t2;
2159+
2160+
assert(t != NULL);
2161+
2162+
/*
2163+
* If this isn't itself a backref target, clear capno and tentatively
2164+
* clear CAP flag.
2165+
*/
2166+
if (!(t->flags & BRUSE))
2167+
{
2168+
t->capno = 0;
2169+
t->flags &= ~CAP;
2170+
}
2171+
2172+
/* Now recurse to children */
2173+
for (t2 = t->child; t2 != NULL; t2 = t2->sibling)
2174+
{
2175+
removecaptures(v, t2);
2176+
/* Propagate child CAP flag back up, if it's still set */
2177+
if (t2->flags & CAP)
2178+
t->flags |= CAP;
2179+
}
2180+
21502181
/*
2151-
* DGP (2007-11-13): I assume it was the programmer's intent to eventually
2152-
* come back and add code to optimize subRE trees, but the routine coded
2153-
* just spends effort traversing the tree and doing nothing. We can do
2154-
* nothing with less effort.
2182+
* If t now contains neither captures nor backrefs, there's no longer any
2183+
* need to care where its sub-match boundaries are, so we can reduce it to
2184+
* a simple DFA node. (Note in particular that MIXED child greediness is
2185+
* not a hindrance here, so we don't use the MESSY() macro.)
21552186
*/
2156-
return;
2187+
if ((t->flags & (CAP | BACKR)) == 0)
2188+
{
2189+
if (t->child)
2190+
freesubreandsiblings(v, t->child);
2191+
t->child = NULL;
2192+
t->op = '=';
2193+
t->flags &= ~MIXED;
2194+
}
21572195
}
21582196

21592197
/*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
25012539
fprintf(f, " hascapture");
25022540
if (t->flags & BACKR)
25032541
fprintf(f, " hasbackref");
2542+
if (t->flags & BRUSE)
2543+
fprintf(f, " isreferenced");
25042544
if (!(t->flags & INUSE))
25052545
fprintf(f, " UNUSED");
25062546
if (t->latype != (char) -1)

src/backend/regex/regexec.c

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,9 @@ pg_regexec(regex_t *re,
213213
return REG_NOMATCH;
214214
backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
215215
v->eflags = flags;
216-
if (v->g->cflags & REG_NOSUB)
217-
nmatch = 0; /* override client */
218-
v->nmatch = nmatch;
219-
if (backref)
216+
if (backref && nmatch <= v->g->nsub)
220217
{
221-
/* need work area */
218+
/* need larger work area */
222219
if (v->g->nsub + 1 <= LOCALMAT)
223220
v->pmatch = mat;
224221
else
@@ -229,7 +226,17 @@ pg_regexec(regex_t *re,
229226
v->nmatch = v->g->nsub + 1;
230227
}
231228
else
229+
{
230+
/* we can store results directly in caller's array */
232231
v->pmatch = pmatch;
232+
/* ensure any extra entries in caller's array are filled with -1 */
233+
if (nmatch > 0)
234+
zapallsubs(pmatch, nmatch);
235+
/* then forget about extra entries, to avoid useless work in find() */
236+
if (nmatch > v->g->nsub + 1)
237+
nmatch = v->g->nsub + 1;
238+
v->nmatch = nmatch;
239+
}
233240
v->details = details;
234241
v->start = (chr *) string;
235242
v->search_start = (chr *) string + search_start;
@@ -290,12 +297,20 @@ pg_regexec(regex_t *re,
290297
else
291298
st = find(v, &v->g->tree->cnfa, &v->g->cmap);
292299

293-
/* copy (portion of) match vector over if necessary */
294-
if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
300+
/* on success, ensure caller's match vector is filled correctly */
301+
if (st == REG_OKAY && nmatch > 0)
295302
{
296-
zapallsubs(pmatch, nmatch);
297-
n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
298-
memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
303+
if (v->pmatch != pmatch)
304+
{
305+
/* copy portion of match vector over from (larger) work area */
306+
assert(nmatch <= v->nmatch);
307+
memcpy(VS(pmatch), VS(v->pmatch), nmatch * sizeof(regmatch_t));
308+
}
309+
if (v->g->cflags & REG_NOSUB)
310+
{
311+
/* don't expose possibly-partial sub-match results to caller */
312+
zapallsubs(pmatch, nmatch);
313+
}
299314
}
300315

301316
/* clean up */
@@ -752,7 +767,6 @@ cdissect(struct vars *v,
752767
break;
753768
case '(': /* no-op capture node */
754769
assert(t->child != NULL);
755-
assert(t->capno > 0);
756770
er = cdissect(v, t->child, begin, end);
757771
break;
758772
default:

src/backend/utils/adt/jsonpath_gram.y

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
583583
errmsg("XQuery \"x\" flag (expanded regular expressions) is not implemented")));
584584
}
585585

586+
/*
587+
* We'll never need sub-match details at execution. While
588+
* RE_compile_and_execute would set this flag anyway, force it on here to
589+
* ensure that the regex cache entries created by makeItemLikeRegex are
590+
* useful.
591+
*/
592+
cflags |= REG_NOSUB;
593+
586594
return cflags;
587595
}
588596

src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
347347
{
348348
regex_t *re;
349349

350+
/* Use REG_NOSUB if caller does not want sub-match details */
351+
if (nmatch < 2)
352+
cflags |= REG_NOSUB;
353+
350354
/* Compile RE */
351355
re = RE_compile_and_cache(text_re, cflags, collation);
352356

@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
14121416
int orig_len;
14131417
pg_wchar *wide_str;
14141418
int wide_len;
1419+
int cflags;
14151420
regex_t *cpattern;
14161421
regmatch_t *pmatch;
14171422
int pmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
14301435
wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
14311436

14321437
/* set up the compiled pattern */
1433-
cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
1438+
cflags = re_flags->cflags;
1439+
if (!use_subpatterns)
1440+
cflags |= REG_NOSUB;
1441+
cpattern = RE_compile_and_cache(pattern, cflags, collation);
14341442

14351443
/* do we want to remember subpatterns? */
14361444
if (use_subpatterns && cpattern->re_nsub > 0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
19521960
if (case_insensitive)
19531961
cflags |= REG_ICASE;
19541962

1955-
re = RE_compile_and_cache(text_re, cflags, collation);
1963+
re = RE_compile_and_cache(text_re, cflags | REG_NOSUB, collation);
19561964

19571965
/* Examine it to see if there's a fixed prefix */
19581966
re_result = pg_regprefix(re, &str, &slen);

src/include/regex/regex.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ typedef struct
106106
#define REG_QUOTE 000004 /* no special characters, none */
107107
#define REG_NOSPEC REG_QUOTE /* historical synonym */
108108
#define REG_ICASE 000010 /* ignore case */
109-
#define REG_NOSUB 000020 /* don't care about subexpressions */
109+
#define REG_NOSUB 000020 /* caller doesn't need subexpr match data */
110110
#define REG_EXPANDED 000040 /* expanded format, white space & comments */
111111
#define REG_NLSTOP 000100 /* \n doesn't match . or [^ ] */
112112
#define REG_NLANCH 000200 /* ^ matches after \n, $ before */

src/include/regex/regguts.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -477,13 +477,14 @@ struct subre
477477
#define MIXED 04 /* mixed preference below */
478478
#define CAP 010 /* capturing parens here or below */
479479
#define BACKR 020 /* back reference here or below */
480+
#define BRUSE 040 /* is referenced by a back reference */
480481
#define INUSE 0100 /* in use in final tree */
481-
#define NOPROP 03 /* bits which may not propagate up */
482+
#define UPPROP (MIXED|CAP|BACKR) /* flags which should propagate up */
482483
#define LMIX(f) ((f)<<2) /* LONGER -> MIXED */
483484
#define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */
484-
#define UP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
485+
#define UP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
485486
#define MESSY(f) ((f)&(MIXED|CAP|BACKR))
486-
#define PREF(f) ((f)&NOPROP)
487+
#define PREF(f) ((f)&(LONGER|SHORTER))
487488
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
488489
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
489490
char latype; /* LATYPE code, if lookaround constraint */

src/test/modules/test_regex/expected/test_regex.out

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
146146
{ae,a}
147147
(2 rows)
148148

149-
-- expectMatch 4.2 o (a)e ae
150-
select * from test_regex('(a)e', 'ae', 'o');
151-
test_regex
152-
------------
153-
{0}
154-
{NULL}
149+
-- expectMatch 4.2 oPR (.)\1e abeaae aae {}
150+
select * from test_regex('(.)\1e', 'abeaae', 'oPR');
151+
test_regex
152+
--------------------------------
153+
{1,REG_UBACKREF,REG_UNONPOSIX}
154+
{aae,NULL}
155155
(2 rows)
156156

157157
-- expectMatch 4.3 b {\(a\)b} ab ab a
@@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
26582658
{"abc abc",abc}
26592659
(2 rows)
26602660

2661+
-- exercise oversize-regmatch_t-array paths in regexec()
2662+
-- (that case is not reachable via test_regex, sadly)
2663+
select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
2664+
substring
2665+
-----------
2666+
f
2667+
(1 row)
2668+
2669+
select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
2670+
regexp_split_to_array
2671+
-----------------------
2672+
{abc,def,ghi}
2673+
(1 row)
2674+
26612675
-- doing 15 "octal escapes vs back references"
26622676
-- # initial zero is always octal
26632677
-- expectMatch 15.1 MP "a\\010b" "a\bb" "a\bb"
@@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
34763490
{x,x,x,NULL}
34773491
(2 rows)
34783492

3493+
-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y
3494+
select * from test_regex('((.))(\2)', 'xyy', 'RP');
3495+
test_regex
3496+
--------------------------------
3497+
{3,REG_UBACKREF,REG_UNONPOSIX}
3498+
{yy,y,y,y}
3499+
(2 rows)
3500+
3501+
-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {}
3502+
select * from test_regex('((.))(\2)', 'xyy', 'oRP');
3503+
test_regex
3504+
--------------------------------
3505+
{3,REG_UBACKREF,REG_UNONPOSIX}
3506+
{yy,NULL,NULL,NULL}
3507+
(2 rows)
3508+
34793509
-- doing 22 "multicharacter collating elements"
34803510
-- # again ugh
34813511
-- MCCEs are not implemented in Postgres, so we skip all these tests

src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
6363

6464
-- expectMatch 4.1 - (a)e ae ae a
6565
select * from test_regex('(a)e', 'ae', '-');
66-
-- expectMatch 4.2 o (a)e ae
67-
select * from test_regex('(a)e', 'ae', 'o');
66+
-- expectMatch 4.2 oPR (.)\1e abeaae aae {}
67+
select * from test_regex('(.)\1e', 'abeaae', 'oPR');
6868
-- expectMatch 4.3 b {\(a\)b} ab ab a
6969
select * from test_regex('\(a\)b', 'ab', 'b');
7070
-- expectMatch 4.4 - a((b)c) abc abc bc b
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
775775
select * from test_regex('(^\w+\M).*\1', 'abc abcd abd', 'LRP');
776776
select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
777777

778+
-- exercise oversize-regmatch_t-array paths in regexec()
779+
-- (that case is not reachable via test_regex, sadly)
780+
select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
781+
select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
782+
778783
-- doing 15 "octal escapes vs back references"
779784

780785
-- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
10111016
select * from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)', 'asd TO foo', 'M');
10121017
-- expectMatch 21.36 RPQ ((.))(\2){0} xy x x x {}
10131018
select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
1019+
-- expectMatch 21.37 RP ((.))(\2) xyy yy y y y
1020+
select * from test_regex('((.))(\2)', 'xyy', 'RP');
1021+
-- expectMatch 21.38 oRP ((.))(\2) xyy yy {} {} {}
1022+
select * from test_regex('((.))(\2)', 'xyy', 'oRP');
10141023

10151024
-- doing 22 "multicharacter collating elements"
10161025
-- # again ugh

0 commit comments

Comments
 (0)