Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 00116de

Browse files
committed
Rethink regexp engine's backref-related compilation state.
I had committer's remorse almost immediately after pushing cb76fbd, upon finding that removing capturing subexpressions' subREs from the data structure broke my proposed patch for REG_NOSUB optimization. Revert that data structure change. Instead, address the concern about not changing capturing subREs' endpoints by not changing the endpoints. We don't need to, because the point of that bit was just to ensure that the atom has endpoints distinct from the outer state pair that we're stringing the branch between. We already made suitable states in the parenthesized-subexpression case, so the additional ones were just useless overhead. This seems more understandable than Spencer's original coding, and it ought to be a shade faster too by saving a few state creations and arc changes. (I actually see a couple percent improvement on Jacobson's web corpus, though that's barely above the noise floor so I wouldn't put much stock in that result.)

Also, fix the logic added by ea1268f to ensure that the subRE recorded in v->subs[subno] is exactly the one with capno == subno. Spencer's original coding recorded the child subRE of the capture node, which is okay so far as having the right endpoint states is concerned, but as of cb76fbd the capturing subRE itself always has those endpoints too. I think the inconsistency is confusing for the REG_NOSUB optimization.

As before, backpatch to v14.

Discussion: https://postgr.es/m/[email protected]
1 parent 75a2d13 commit 00116de

File tree

1 file changed

+47
-37
lines changed

1 file changed

+47
-37
lines changed

src/backend/regex/regcomp.c

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
233233
static int casecmp(const chr *, const chr *, size_t);
234234

235235

236-
/* info we need during compilation about a known capturing subexpression */
237-
struct subinfo
238-
{
239-
struct state *left; /* left end of its sub-NFA */
240-
struct state *right; /* right end of its sub-NFA */
241-
};
242-
243236
/* internal variables, bundled for easy passing around */
244237
struct vars
245238
{
@@ -252,10 +245,10 @@ struct vars
252245
int nexttype; /* type of next token */
253246
chr nextvalue; /* value (if any) of next token */
254247
int lexcon; /* lexical context type (see regc_lex.c) */
255-
int nsubexp; /* number of known capturing subexpressions */
256-
struct subinfo *subs; /* info about known capturing subexpressions */
257-
size_t nsubs; /* allocated length of subs[] vector */
258-
struct subinfo sub10[10]; /* initial vector, enough for most */
248+
int nsubexp; /* subexpression count */
249+
struct subre **subs; /* subRE pointer vector */
250+
size_t nsubs; /* length of vector */
251+
struct subre *sub10[10]; /* initial vector, enough for most */
259252
struct nfa *nfa; /* the NFA */
260253
struct colormap *cm; /* character color map */
261254
color nlcolor; /* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
375368
v->subs = v->sub10;
376369
v->nsubs = 10;
377370
for (j = 0; j < v->nsubs; j++)
378-
v->subs[j].left = v->subs[j].right = NULL;
371+
v->subs[j] = NULL;
379372
v->nfa = NULL;
380373
v->cm = NULL;
381374
v->nlcolor = COLORLESS;
@@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
511504
}
512505

513506
/*
514-
* moresubs - enlarge capturing-subexpressions vector
507+
* moresubs - enlarge subRE vector
515508
*/
516509
static void
517510
moresubs(struct vars *v,
518511
int wanted) /* want enough room for this one */
519512
{
520-
struct subinfo *p;
513+
struct subre **p;
521514
size_t n;
522515

523516
assert(wanted > 0 && (size_t) wanted >= v->nsubs);
524517
n = (size_t) wanted * 3 / 2 + 1;
525518

526519
if (v->subs == v->sub10)
527520
{
528-
p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
521+
p = (struct subre **) MALLOC(n * sizeof(struct subre *));
529522
if (p != NULL)
530523
memcpy(VS(p), VS(v->subs),
531-
v->nsubs * sizeof(struct subinfo));
524+
v->nsubs * sizeof(struct subre *));
532525
}
533526
else
534-
p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
527+
p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
535528
if (p == NULL)
536529
{
537530
ERR(REG_ESPACE);
538531
return;
539532
}
540533
v->subs = p;
541534
for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
542-
p->left = p->right = NULL;
535+
*p = NULL;
543536
assert(v->nsubs == n);
544537
assert((size_t) wanted < v->nsubs);
545538
}
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
988981
s = newstate(v->nfa);
989982
s2 = newstate(v->nfa);
990983
NOERRN();
984+
/* We may not need these arcs, but keep things connected for now */
991985
EMPTYARC(lp, s);
992986
EMPTYARC(s2, rp);
993987
NOERRN();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
997991
NOERRN();
998992
if (cap)
999993
{
1000-
/* save the sub-NFA's endpoints for future backrefs to use */
1001-
assert(v->subs[subno].left == NULL);
1002-
v->subs[subno].left = s;
1003-
v->subs[subno].right = s2;
1004994
if (atom->capno == 0)
1005995
{
1006996
/* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
10161006
t->child = atom;
10171007
atom = t;
10181008
}
1009+
assert(v->subs[subno] == NULL);
1010+
v->subs[subno] = atom;
10191011
}
10201012
/* postpone everything else pending possible {0} */
10211013
break;
10221014
case BACKREF: /* the Feature From The Black Lagoon */
10231015
INSIST(type != LACON, REG_ESUBREG);
10241016
INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
1025-
INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
1017+
INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
10261018
NOERRN();
10271019
assert(v->nextvalue > 0);
10281020
atom = subre(v, 'b', BACKR, lp, rp);
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
10971089
if (atom != NULL)
10981090
freesubre(v, atom);
10991091
if (atomtype == '(')
1100-
v->subs[subno].left = v->subs[subno].right = NULL;
1092+
v->subs[subno] = NULL;
11011093
delsub(v->nfa, lp, rp);
11021094
EMPTYARC(lp, rp);
11031095
return top;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
11301122
NOERRN();
11311123
}
11321124

1125+
/*
1126+
* For what follows, we need the atom to have its own begin/end states
1127+
* that are distinct from lp/rp, so that we can wrap iteration structure
1128+
* around it. The parenthesized-atom case above already made suitable
1129+
* states (and we don't want to modify a capturing subre, since it's
1130+
* already recorded in v->subs[]). Otherwise, we need more states.
1131+
*/
1132+
if (atom->begin == lp || atom->end == rp)
1133+
{
1134+
s = newstate(v->nfa);
1135+
s2 = newstate(v->nfa);
1136+
NOERRN();
1137+
moveouts(v->nfa, lp, s);
1138+
moveins(v->nfa, rp, s2);
1139+
atom->begin = s;
1140+
atom->end = s2;
1141+
}
1142+
else
1143+
{
1144+
/* The atom's OK, but we must temporarily disconnect it from lp/rp */
1145+
/* (this removes the EMPTY arcs we made above) */
1146+
delsub(v->nfa, lp, atom->begin);
1147+
delsub(v->nfa, atom->end, rp);
1148+
}
1149+
11331150
/*----------
11341151
* Prepare a general-purpose state skeleton.
11351152
*
11361153
* In the no-backrefs case, we want this:
11371154
*
1138-
* [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1155+
* [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
11391156
*
1140-
* where prefix is some repetitions of atom. In the general case we need
1157+
* where prefix is some repetitions of atom, and "rest" is the remainder
1158+
* of the branch. In the general case we need:
11411159
*
11421160
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
11431161
*
1144-
* where the iterator wraps around [begin] ---atom---> [end]
1162+
* where the iterator wraps around the atom.
11451163
*
11461164
* We make the s state here for both cases; s2 is made below if needed
11471165
*----------
11481166
*/
1149-
s = newstate(v->nfa); /* first, new endpoints for the atom */
1150-
s2 = newstate(v->nfa);
1151-
NOERRN();
1152-
moveouts(v->nfa, lp, s);
1153-
moveins(v->nfa, rp, s2);
1154-
NOERRN();
1155-
atom->begin = s;
1156-
atom->end = s2;
11571167
s = newstate(v->nfa); /* set up starting state */
11581168
NOERRN();
11591169
EMPTYARC(lp, s);
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
11901200
{
11911201
assert(atom->begin->nouts == 1); /* just the EMPTY */
11921202
delsub(v->nfa, atom->begin, atom->end);
1193-
assert(v->subs[subno].left != NULL);
1203+
assert(v->subs[subno] != NULL);
11941204

11951205
/*
11961206
* And here's why the recursion got postponed: it must wait until the
11971207
* skeleton is filled in, because it may hit a backref that wants to
11981208
* copy the filled-in skeleton.
11991209
*/
1200-
dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
1210+
dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
12011211
atom->begin, atom->end);
12021212
NOERRN();
12031213

0 commit comments

Comments
 (0)