@@ -233,13 +233,6 @@ static int cmp(const chr *, const chr *, size_t);
233
233
static int casecmp (const chr * , const chr * , size_t );
234
234
235
235
236
- /* info we need during compilation about a known capturing subexpression */
237
- struct subinfo
238
- {
239
- struct state * left ; /* left end of its sub-NFA */
240
- struct state * right ; /* right end of its sub-NFA */
241
- };
242
-
243
236
/* internal variables, bundled for easy passing around */
244
237
struct vars
245
238
{
@@ -252,10 +245,10 @@ struct vars
252
245
int nexttype ; /* type of next token */
253
246
chr nextvalue ; /* value (if any) of next token */
254
247
int lexcon ; /* lexical context type (see regc_lex.c) */
255
- int nsubexp ; /* number of known capturing subexpressions */
256
- struct subinfo * subs ; /* info about known capturing subexpressions */
257
- size_t nsubs ; /* allocated length of subs[] vector */
258
- struct subinfo sub10 [10 ]; /* initial vector, enough for most */
248
+ int nsubexp ; /* subexpression count */
249
+ struct subre * * subs ; /* subRE pointer vector */
250
+ size_t nsubs ; /* length of vector */
251
+ struct subre * sub10 [10 ]; /* initial vector, enough for most */
259
252
struct nfa * nfa ; /* the NFA */
260
253
struct colormap * cm ; /* character color map */
261
254
color nlcolor ; /* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
375
368
v -> subs = v -> sub10 ;
376
369
v -> nsubs = 10 ;
377
370
for (j = 0 ; j < v -> nsubs ; j ++ )
378
- v -> subs [j ]. left = v -> subs [ j ]. right = NULL ;
371
+ v -> subs [j ] = NULL ;
379
372
v -> nfa = NULL ;
380
373
v -> cm = NULL ;
381
374
v -> nlcolor = COLORLESS ;
@@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
511
504
}
512
505
513
506
/*
514
- * moresubs - enlarge capturing-subexpressions vector
507
+ * moresubs - enlarge subRE vector
515
508
*/
516
509
static void
517
510
moresubs (struct vars * v ,
518
511
int wanted ) /* want enough room for this one */
519
512
{
520
- struct subinfo * p ;
513
+ struct subre * * p ;
521
514
size_t n ;
522
515
523
516
assert (wanted > 0 && (size_t ) wanted >= v -> nsubs );
524
517
n = (size_t ) wanted * 3 / 2 + 1 ;
525
518
526
519
if (v -> subs == v -> sub10 )
527
520
{
528
- p = (struct subinfo * ) MALLOC (n * sizeof (struct subinfo ));
521
+ p = (struct subre * * ) MALLOC (n * sizeof (struct subre * ));
529
522
if (p != NULL )
530
523
memcpy (VS (p ), VS (v -> subs ),
531
- v -> nsubs * sizeof (struct subinfo ));
524
+ v -> nsubs * sizeof (struct subre * ));
532
525
}
533
526
else
534
- p = (struct subinfo * ) REALLOC (v -> subs , n * sizeof (struct subinfo ));
527
+ p = (struct subre * * ) REALLOC (v -> subs , n * sizeof (struct subre * ));
535
528
if (p == NULL )
536
529
{
537
530
ERR (REG_ESPACE );
538
531
return ;
539
532
}
540
533
v -> subs = p ;
541
534
for (p = & v -> subs [v -> nsubs ]; v -> nsubs < n ; p ++ , v -> nsubs ++ )
542
- p -> left = p -> right = NULL ;
535
+ * p = NULL ;
543
536
assert (v -> nsubs == n );
544
537
assert ((size_t ) wanted < v -> nsubs );
545
538
}
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
988
981
s = newstate (v -> nfa );
989
982
s2 = newstate (v -> nfa );
990
983
NOERRN ();
984
+ /* We may not need these arcs, but keep things connected for now */
991
985
EMPTYARC (lp , s );
992
986
EMPTYARC (s2 , rp );
993
987
NOERRN ();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
997
991
NOERRN ();
998
992
if (cap )
999
993
{
1000
- /* save the sub-NFA's endpoints for future backrefs to use */
1001
- assert (v -> subs [subno ].left == NULL );
1002
- v -> subs [subno ].left = s ;
1003
- v -> subs [subno ].right = s2 ;
1004
994
if (atom -> capno == 0 )
1005
995
{
1006
996
/* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
1016
1006
t -> child = atom ;
1017
1007
atom = t ;
1018
1008
}
1009
+ assert (v -> subs [subno ] == NULL );
1010
+ v -> subs [subno ] = atom ;
1019
1011
}
1020
1012
/* postpone everything else pending possible {0} */
1021
1013
break ;
1022
1014
case BACKREF : /* the Feature From The Black Lagoon */
1023
1015
INSIST (type != LACON , REG_ESUBREG );
1024
1016
INSIST (v -> nextvalue < v -> nsubs , REG_ESUBREG );
1025
- INSIST (v -> subs [v -> nextvalue ]. left != NULL , REG_ESUBREG );
1017
+ INSIST (v -> subs [v -> nextvalue ] != NULL , REG_ESUBREG );
1026
1018
NOERRN ();
1027
1019
assert (v -> nextvalue > 0 );
1028
1020
atom = subre (v , 'b' , BACKR , lp , rp );
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
1097
1089
if (atom != NULL )
1098
1090
freesubre (v , atom );
1099
1091
if (atomtype == '(' )
1100
- v -> subs [subno ]. left = v -> subs [ subno ]. right = NULL ;
1092
+ v -> subs [subno ] = NULL ;
1101
1093
delsub (v -> nfa , lp , rp );
1102
1094
EMPTYARC (lp , rp );
1103
1095
return top ;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
1130
1122
NOERRN ();
1131
1123
}
1132
1124
1125
+ /*
1126
+ * For what follows, we need the atom to have its own begin/end states
1127
+ * that are distinct from lp/rp, so that we can wrap iteration structure
1128
+ * around it. The parenthesized-atom case above already made suitable
1129
+ * states (and we don't want to modify a capturing subre, since it's
1130
+ * already recorded in v->subs[]). Otherwise, we need more states.
1131
+ */
1132
+ if (atom -> begin == lp || atom -> end == rp )
1133
+ {
1134
+ s = newstate (v -> nfa );
1135
+ s2 = newstate (v -> nfa );
1136
+ NOERRN ();
1137
+ moveouts (v -> nfa , lp , s );
1138
+ moveins (v -> nfa , rp , s2 );
1139
+ atom -> begin = s ;
1140
+ atom -> end = s2 ;
1141
+ }
1142
+ else
1143
+ {
1144
+ /* The atom's OK, but we must temporarily disconnect it from lp/rp */
1145
+ /* (this removes the EMPTY arcs we made above) */
1146
+ delsub (v -> nfa , lp , atom -> begin );
1147
+ delsub (v -> nfa , atom -> end , rp );
1148
+ }
1149
+
1133
1150
/*----------
1134
1151
* Prepare a general-purpose state skeleton.
1135
1152
*
1136
1153
* In the no-backrefs case, we want this:
1137
1154
*
1138
- * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1155
+ * [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
1139
1156
*
1140
- * where prefix is some repetitions of atom. In the general case we need
1157
+ * where prefix is some repetitions of atom, and "rest" is the remainder
1158
+ * of the branch. In the general case we need:
1141
1159
*
1142
1160
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1143
1161
*
1144
- * where the iterator wraps around [begin] --- atom---> [end]
1162
+ * where the iterator wraps around the atom.
1145
1163
*
1146
1164
* We make the s state here for both cases; s2 is made below if needed
1147
1165
*----------
1148
1166
*/
1149
- s = newstate (v -> nfa ); /* first, new endpoints for the atom */
1150
- s2 = newstate (v -> nfa );
1151
- NOERRN ();
1152
- moveouts (v -> nfa , lp , s );
1153
- moveins (v -> nfa , rp , s2 );
1154
- NOERRN ();
1155
- atom -> begin = s ;
1156
- atom -> end = s2 ;
1157
1167
s = newstate (v -> nfa ); /* set up starting state */
1158
1168
NOERRN ();
1159
1169
EMPTYARC (lp , s );
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
1190
1200
{
1191
1201
assert (atom -> begin -> nouts == 1 ); /* just the EMPTY */
1192
1202
delsub (v -> nfa , atom -> begin , atom -> end );
1193
- assert (v -> subs [subno ]. left != NULL );
1203
+ assert (v -> subs [subno ] != NULL );
1194
1204
1195
1205
/*
1196
1206
* And here's why the recursion got postponed: it must wait until the
1197
1207
* skeleton is filled in, because it may hit a backref that wants to
1198
1208
* copy the filled-in skeleton.
1199
1209
*/
1200
- dupnfa (v -> nfa , v -> subs [subno ]. left , v -> subs [subno ]. right ,
1210
+ dupnfa (v -> nfa , v -> subs [subno ]-> begin , v -> subs [subno ]-> end ,
1201
1211
atom -> begin , atom -> end );
1202
1212
NOERRN ();
1203
1213
0 commit comments