Rethink regexp engine's backref-related compilation state.

tglsfdc · tglsfdc · commit 00116dee5ad4 · 2021-08-08T11:56:29.000-04:00
I had committer's remorse almost immediately after pushing cb76fbd, upon finding that removing capturing subexpressions' subREs from the data structure broke my proposed patch for REG_NOSUB optimization. Revert that data structure change. Instead, address the concern about not changing capturing subREs' endpoints by not changing the endpoints. We don't need to, because the point of that bit was just to ensure that the atom has endpoints distinct from the outer state pair that we're stringing the branch between. We already made suitable states in the parenthesized-subexpression case, so the additional ones were just useless overhead. This seems more understandable than Spencer's original coding, and it ought to be a shade faster too by saving a few state creations and arc changes. (I actually see a couple percent improvement on Jacobson's web corpus, though that's barely above the noise floor so I wouldn't put much stock in that result.) Also, fix the logic added by ea1268f to ensure that the subRE recorded in v->subs[subno] is exactly the one with capno == subno. Spencer's original coding recorded the child subRE of the capture node, which is okay so far as having the right endpoint states is concerned, but as of cb76fbd the capturing subRE itself always has those endpoints too. I think the inconsistency is confusing for the REG_NOSUB optimization. As before, backpatch to v14. Discussion: https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
@@ -233,13 +233,6 @@ static int	cmp(const chr *, const chr *, size_t);
 static int	casecmp(const chr *, const chr *, size_t);
 
 
-/* info we need during compilation about a known capturing subexpression */
-struct subinfo
-{
-	struct state *left;			/* left end of its sub-NFA */
-	struct state *right;		/* right end of its sub-NFA */
-};
-
 /* internal variables, bundled for easy passing around */
 struct vars
 {
@@ -252,10 +245,10 @@ struct vars
 	int			nexttype;		/* type of next token */
 	chr			nextvalue;		/* value (if any) of next token */
 	int			lexcon;			/* lexical context type (see regc_lex.c) */
-	int			nsubexp;		/* number of known capturing subexpressions */
-	struct subinfo *subs;		/* info about known capturing subexpressions */
-	size_t		nsubs;			/* allocated length of subs[] vector */
-	struct subinfo sub10[10];	/* initial vector, enough for most */
+	int			nsubexp;		/* subexpression count */
+	struct subre **subs;		/* subRE pointer vector */
+	size_t		nsubs;			/* length of vector */
+	struct subre *sub10[10];	/* initial vector, enough for most */
 	struct nfa *nfa;			/* the NFA */
 	struct colormap *cm;		/* character color map */
 	color		nlcolor;		/* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
 	v->subs = v->sub10;
 	v->nsubs = 10;
 	for (j = 0; j < v->nsubs; j++)
-		v->subs[j].left = v->subs[j].right = NULL;
+		v->subs[j] = NULL;
 	v->nfa = NULL;
 	v->cm = NULL;
 	v->nlcolor = COLORLESS;
@@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
 }
 
 /*
- * moresubs - enlarge capturing-subexpressions vector
+ * moresubs - enlarge subRE vector
  */
 static void
 moresubs(struct vars *v,
 		 int wanted)			/* want enough room for this one */
 {
-	struct subinfo *p;
+	struct subre **p;
 	size_t		n;
 
 	assert(wanted > 0 && (size_t) wanted >= v->nsubs);
 	n = (size_t) wanted * 3 / 2 + 1;
 
 	if (v->subs == v->sub10)
 	{
-		p = (struct subinfo *) MALLOC(n * sizeof(struct subinfo));
+		p = (struct subre **) MALLOC(n * sizeof(struct subre *));
 		if (p != NULL)
 			memcpy(VS(p), VS(v->subs),
-				   v->nsubs * sizeof(struct subinfo));
+				   v->nsubs * sizeof(struct subre *));
 	}
 	else
-		p = (struct subinfo *) REALLOC(v->subs, n * sizeof(struct subinfo));
+		p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
 	if (p == NULL)
 	{
 		ERR(REG_ESPACE);
 		return;
 	}
 	v->subs = p;
 	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
-		p->left = p->right = NULL;
+		*p = NULL;
 	assert(v->nsubs == n);
 	assert((size_t) wanted < v->nsubs);
 }
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
 			s = newstate(v->nfa);
 			s2 = newstate(v->nfa);
 			NOERRN();
+			/* We may not need these arcs, but keep things connected for now */
 			EMPTYARC(lp, s);
 			EMPTYARC(s2, rp);
 			NOERRN();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
 			NOERRN();
 			if (cap)
 			{
-				/* save the sub-NFA's endpoints for future backrefs to use */
-				assert(v->subs[subno].left == NULL);
-				v->subs[subno].left = s;
-				v->subs[subno].right = s2;
 				if (atom->capno == 0)
 				{
 					/* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
 					t->child = atom;
 					atom = t;
 				}
+				assert(v->subs[subno] == NULL);
+				v->subs[subno] = atom;
 			}
 			/* postpone everything else pending possible {0} */
 			break;
 		case BACKREF:			/* the Feature From The Black Lagoon */
 			INSIST(type != LACON, REG_ESUBREG);
 			INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
-			INSIST(v->subs[v->nextvalue].left != NULL, REG_ESUBREG);
+			INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
 			NOERRN();
 			assert(v->nextvalue > 0);
 			atom = subre(v, 'b', BACKR, lp, rp);
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
 		if (atom != NULL)
 			freesubre(v, atom);
 		if (atomtype == '(')
-			v->subs[subno].left = v->subs[subno].right = NULL;
+			v->subs[subno] = NULL;
 		delsub(v->nfa, lp, rp);
 		EMPTYARC(lp, rp);
 		return top;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
 		NOERRN();
 	}
 
+	/*
+	 * For what follows, we need the atom to have its own begin/end states
+	 * that are distinct from lp/rp, so that we can wrap iteration structure
+	 * around it.  The parenthesized-atom case above already made suitable
+	 * states (and we don't want to modify a capturing subre, since it's
+	 * already recorded in v->subs[]).  Otherwise, we need more states.
+	 */
+	if (atom->begin == lp || atom->end == rp)
+	{
+		s = newstate(v->nfa);
+		s2 = newstate(v->nfa);
+		NOERRN();
+		moveouts(v->nfa, lp, s);
+		moveins(v->nfa, rp, s2);
+		atom->begin = s;
+		atom->end = s2;
+	}
+	else
+	{
+		/* The atom's OK, but we must temporarily disconnect it from lp/rp */
+		/* (this removes the EMPTY arcs we made above) */
+		delsub(v->nfa, lp, atom->begin);
+		delsub(v->nfa, atom->end, rp);
+	}
+
 	/*----------
 	 * Prepare a general-purpose state skeleton.
 	 *
 	 * In the no-backrefs case, we want this:
 	 *
-	 * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
+	 * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
 	 *
-	 * where prefix is some repetitions of atom.  In the general case we need
+	 * where prefix is some repetitions of atom, and "rest" is the remainder
+	 * of the branch.  In the general case we need:
 	 *
 	 * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
 	 *
-	 * where the iterator wraps around [begin] ---atom---> [end]
+	 * where the iterator wraps around the atom.
 	 *
 	 * We make the s state here for both cases; s2 is made below if needed
 	 *----------
 	 */
-	s = newstate(v->nfa);		/* first, new endpoints for the atom */
-	s2 = newstate(v->nfa);
-	NOERRN();
-	moveouts(v->nfa, lp, s);
-	moveins(v->nfa, rp, s2);
-	NOERRN();
-	atom->begin = s;
-	atom->end = s2;
 	s = newstate(v->nfa);		/* set up starting state */
 	NOERRN();
 	EMPTYARC(lp, s);
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
 	{
 		assert(atom->begin->nouts == 1);	/* just the EMPTY */
 		delsub(v->nfa, atom->begin, atom->end);
-		assert(v->subs[subno].left != NULL);
+		assert(v->subs[subno] != NULL);
 
 		/*
 		 * And here's why the recursion got postponed: it must wait until the
 		 * skeleton is filled in, because it may hit a backref that wants to
 		 * copy the filled-in skeleton.
 		 */
-		dupnfa(v->nfa, v->subs[subno].left, v->subs[subno].right,
+		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
 			   atom->begin, atom->end);
 		NOERRN();