@@ -136,7 +136,7 @@ RegExpRoot getRoot(RegExpTerm term) {
136136 */
137137newtype TInputSymbol =
138138 /** An input symbol corresponding to character `c`. */
139- Char ( string c ) { c = any ( RegExpConstant cc ) .getValue ( ) .charAt ( _) } or
139+ Char ( string c ) { c = any ( RegExpConstant cc | getRoot ( cc ) . isRelevant ( ) ) .getValue ( ) .charAt ( _) } or
140140 /**
141141 * An input symbol representing all characters matched by
142142 * (positive, non-universal) character class `recc`.
@@ -162,6 +162,31 @@ newtype TInputSymbol =
162162 /** An epsilon transition in the automaton. */
163163 Epsilon ( )
164164
165+ /**
166+ * Holds if `a` and `b` are input symbols from the same regexp, i.e. `belongsTo` relates both of them to at least one common `RegExpRoot`.
167+ * Never holds when `a` or `b` is a `Dot()`, `Any()` or `Epsilon()`: `belongsTo` only relates `Char`, `CharClass` and `InvertedCharClass` symbols to a root.
168+ */
169+ pragma [ noinline] // NOTE(review): presumably keeps this join as its own predicate instead of being inlined into each caller - confirm
170+ private predicate sharesRoot ( TInputSymbol a , TInputSymbol b ) {
171+ exists ( RegExpRoot root |
172+ belongsTo ( a , root ) and
173+ belongsTo ( b , root )
174+ )
175+ }
176+
177+ /**
178+ * Holds if `a` is an input symbol from a regexp that has root `root`: some term under `root` either is a constant containing the character of the `Char` symbol `a`, or is the term from which the `CharClass`/`InvertedCharClass` symbol `a` is built.
179+ */
180+ private predicate belongsTo ( TInputSymbol a , RegExpRoot root ) {
181+ exists ( RegExpTerm term | getRoot ( term ) = root |
182+ a = Char ( term .( RegExpConstant ) .getValue ( ) .charAt ( _) ) // a `Char` carries only the character, so it belongs to every root with a constant containing that character
183+ or
184+ a = CharClass ( term ) // symbol built directly from `term`
185+ or
186+ a = InvertedCharClass ( term ) // symbol built directly from `term`
187+ )
188+ }
189+
165190/**
166191 * An abstract input symbol, representing a set of concrete characters.
167192 */
@@ -485,6 +510,7 @@ private string getARelevantCharClassChar(TInputSymbol symbol) {
485510 * negative char class `d`.
486511 */
487512private string getAOverlapBetweenCharacterClasses ( CharClass c , InvertedCharClass d ) {
513+ sharesRoot ( c , d ) and
488514 result = [ getARelevantCharClassChar ( c ) , getARelevantCharClassChar ( d ) ] and
489515 exists ( RegExpCharacterClass negClass , RegExpCharacterClass posClass |
490516 c = CharClass ( posClass ) and
@@ -500,12 +526,15 @@ private string getAOverlapBetweenCharacterClasses(CharClass c, InvertedCharClass
500526string intersect ( InputSymbol c , InputSymbol d ) {
501527 c = Char ( result ) and
502528 (
503- d = Char ( result )
504- or
505- exists ( RegExpCharacterClass cc | d = CharClass ( cc ) | charClassMatchesChar ( cc , result ) )
506- or
507- exists ( RegExpCharacterClass cc | d = InvertedCharClass ( cc ) |
508- not charClassMatchesChar ( cc , result )
529+ sharesRoot ( c , d ) and
530+ (
531+ d = Char ( result )
532+ or
533+ exists ( RegExpCharacterClass cc | d = CharClass ( cc ) | charClassMatchesChar ( cc , result ) )
534+ or
535+ exists ( RegExpCharacterClass cc | d = InvertedCharClass ( cc ) |
536+ not charClassMatchesChar ( cc , result )
537+ )
509538 )
510539 or
511540 d = Dot ( ) and
@@ -517,7 +546,7 @@ string intersect(InputSymbol c, InputSymbol d) {
517546 result = getMinOverlapBetweenCharacterClasses ( c , d )
518547 or
519548 exists ( RegExpCharacterClass cc | c = InvertedCharClass ( cc ) and result = chooseFromInverted ( cc ) |
520- d = InvertedCharClass ( cc )
549+ d = InvertedCharClass ( cc ) and sharesRoot ( c , d )
521550 or
522551 d = Dot ( ) and
523552 not ( result = "\n" or result = "\r" )
@@ -526,7 +555,7 @@ string intersect(InputSymbol c, InputSymbol d) {
526555 )
527556 or
528557 exists ( RegExpCharacterClass cc | c = CharClass ( cc ) and result = choose ( cc ) |
529- d = CharClass ( cc )
558+ d = CharClass ( cc ) and sharesRoot ( c , d )
530559 or
531560 d = Dot ( ) and
532561 not ( result = "\n" or result = "\r" )
0 commit comments