@@ -139,20 +139,10 @@ newtype TInputSymbol =
139139 Char ( string c ) { c = any ( RegExpConstant cc | getRoot ( cc ) .isRelevant ( ) ) .getValue ( ) .charAt ( _) } or
140140 /**
141141 * An input symbol representing all characters matched by
142- * (positive, non-universal) character class `recc`.
142+ * (non-universal) character class `recc`.
143143 */
144144 CharClass ( RegExpCharacterClass recc ) {
145145 getRoot ( recc ) .isRelevant ( ) and
146- not recc .isInverted ( ) and
147- not recc .isUniversalClass ( )
148- } or
149- /**
150- * An input symbol representing all characters matched by
151- * the inverted (non-universal) character class `recc`.
152- */
153- InvertedCharClass ( RegExpCharacterClass recc ) {
154- getRoot ( recc ) .isRelevant ( ) and
155- recc .isInverted ( ) and
156146 not recc .isUniversalClass ( )
157147 } or
158148 /** An input symbol representing all characters matched by `.`. */
@@ -166,7 +156,6 @@ newtype TInputSymbol =
166156 * Holds if `a` and `b` are input symbols from the same regexp.
167157 * (And not a `Dot()`, `Any()` or `Epsilon()`)
168158 */
169- pragma [ noinline]
170159private predicate sharesRoot ( TInputSymbol a , TInputSymbol b ) {
171160 exists ( RegExpRoot root |
172161 belongsTo ( a , root ) and
@@ -182,8 +171,6 @@ private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
182171 a = Char ( term .( RegExpConstant ) .getValue ( ) .charAt ( _) )
183172 or
184173 a = CharClass ( term )
185- or
186- a = InvertedCharClass ( term )
187174 )
188175}
189176
@@ -198,14 +185,121 @@ class InputSymbol extends TInputSymbol {
198185 or
199186 result = any ( RegExpCharacterClass recc | this = CharClass ( recc ) ) .toString ( )
200187 or
201- result = any ( RegExpCharacterClass recc | this = InvertedCharClass ( recc ) ) .toString ( )
202- or
203188 this = Dot ( ) and result = "."
204189 or
205190 this = Any ( ) and result = "[^]"
206191 }
207192}
208193
194+ /**
195+ * An abstract input symbol that represents a character class.
196+ */
197+ abstract class CharacterClass extends InputSymbol {
198+ /**
199+ * Gets a char that is likely relevant for the ReDoS analysis of this character class.
200+ * That is: one of the endpoints of the character class,
201+ * or a char that is off-by-one from one of the endpoints of the character class (if this is an inverted character class).
202+ */
203+ abstract string getARelevantChar ( ) ;
204+
205+ /**
206+ * Holds if this character class matches `char`.
207+ */
208+ bindingset [ char]
209+ abstract predicate matches ( string char ) ;
210+
211+ /**
212+ * Gets a single character matched by this character class.
213+ */
214+ abstract string choose ( ) ;
215+ }
216+
217+ /**
218+ * Provides implementations for `CharacterClass`.
219+ */
220+ private module CharacterClasses {
221+ /**
222+ * Holds if the character class `cc` has a child (constant or range) that matches `char`.
223+ */
224+ bindingset [ char]
225+ predicate hasChildThatMatches ( RegExpCharacterClass cc , string char ) {
226+ exists ( RegExpTerm child | child = cc .getAChild ( ) |
227+ char = child .( RegExpConstant ) .getValue ( )
228+ or
229+ exists ( string lo , string hi | child .( RegExpCharacterRange ) .isRange ( lo , hi ) |
230+ lo <= char and char <= hi
231+ )
232+ // TODO: Also handle `RegExpCharacterClassEscape` children.
233+ )
234+ }
235+
236+ /**
237+ * Gets a char that is mentioned in the character class `c`: a constant, or either endpoint of a range.
238+ */
239+ private string getAMentionedChar ( RegExpCharacterClass c ) {
240+ exists ( RegExpTerm child | child = c .getAChild ( ) |
241+ result = child .( RegExpConstant ) .getValue ( )
242+ or
243+ child .( RegExpCharacterRange ) .isRange ( result , _)
244+ or
245+ child .( RegExpCharacterRange ) .isRange ( _, result )
246+ )
247+ }
248+
249+ /**
250+ * An implementation of `CharacterClass` for positive (non-inverted) character classes.
251+ */
252+ private class PositiveCharacterClass extends CharacterClass {
253+ RegExpCharacterClass cc ;
254+
255+ PositiveCharacterClass ( ) { this = CharClass ( cc ) and not cc .isInverted ( ) }
256+
257+ override string getARelevantChar ( ) { result = getAMentionedChar ( cc ) }
258+
259+ bindingset [ char]
260+ override predicate matches ( string char ) { hasChildThatMatches ( cc , char ) }
261+
262+ override string choose ( ) {
263+ result =
264+ min ( string c |
265+ exists ( RegExpTerm child | child = cc .getAChild ( ) |
266+ c = child .( RegExpConstant ) .getValue ( ) or
267+ child .( RegExpCharacterRange ) .isRange ( c , _)
268+ )
269+ )
270+ }
271+ }
272+
273+ /**
274+ * An implementation of `CharacterClass` for inverted character classes.
275+ */
276+ private class InvertedCharacterClass extends CharacterClass {
277+ RegExpCharacterClass cc ;
278+
279+ InvertedCharacterClass ( ) { this = CharClass ( cc ) and cc .isInverted ( ) }
280+
281+ override string getARelevantChar ( ) {
282+ result = nextChar ( getAMentionedChar ( cc ) ) or
283+ nextChar ( result ) = getAMentionedChar ( cc )
284+ }
285+
286+ bindingset [ char]
287+ override predicate matches ( string char ) { not hasChildThatMatches ( cc , char ) }
288+
289+ override string choose ( ) {
290+ // The char after the largest constant or range upper bound in the inverted character class.
291+ result =
292+ nextChar ( max ( string c |
293+ exists ( RegExpTerm child | child = cc .getAChild ( ) |
294+ c = child .( RegExpConstant ) .getValue ( ) or
295+ child .( RegExpCharacterRange ) .isRange ( _, c )
296+ )
297+ ) )
298+ }
299+ }
300+ // TODO: Add implementations for `RegExpCharacterClassEscape`.
301+ }
302+
209303newtype TState =
210304 Match ( RegExpTerm t , int i ) {
211305 getRoot ( t ) .isRelevant ( ) and
@@ -303,10 +397,11 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
303397 cc .isUniversalClass ( ) and q1 = before ( cc ) and lbl = Any ( ) and q2 = after ( cc )
304398 or
305399 q1 = before ( cc ) and
306- ( lbl = CharClass ( cc ) or lbl = InvertedCharClass ( cc ) ) and
400+ lbl = CharClass ( cc ) and
307401 q2 = after ( cc )
308402 )
309403 or
404+ // TODO: Add a case for `RegExpCharacterClassEscape`.
310405 exists ( RegExpAlt alt | lbl = Epsilon ( ) | q1 = before ( alt ) and q2 = before ( alt .getAChild ( ) ) )
311406 or
312407 exists ( RegExpSequence seq | lbl = Epsilon ( ) | q1 = before ( seq ) and q2 = before ( seq .getChild ( 0 ) ) )
@@ -454,87 +549,23 @@ newtype Trace =
454549 t = Nil ( ) and isFork ( _, s1 , s2 , _, _)
455550 }
456551
457- /**
458- * Holds if the character class `cc` has a child (constant or range) that matches `char`.
459- */
460- bindingset [ char]
461- predicate charClassMatchesChar ( RegExpCharacterClass cc , string char ) {
462- exists ( RegExpTerm child | child = cc .getAChild ( ) |
463- char = child .( RegExpConstant ) .getValue ( )
464- or
465- exists ( string lo , string hi | child .( RegExpCharacterRange ) .isRange ( lo , hi ) |
466- lo <= char and char <= hi
467- )
468- )
469- }
470-
471552/**
472553 * Gets the minimum char that is matched by both the character classes `c` and `d`.
473554 */
474- pragma [ noinline]
475- private string getMinOverlapBetweenCharacterClasses ( TInputSymbol c , TInputSymbol d ) {
555+ private string getMinOverlapBetweenCharacterClasses ( CharacterClass c , CharacterClass d ) {
476556 result = min ( getAOverlapBetweenCharacterClasses ( c , d ) )
477557}
478558
479- /**
480- * Gets a char that is mentioned in the character class `c`.
481- */
482- private string getAMentionedChar ( RegExpCharacterClass c ) {
483- exists ( RegExpTerm child | child = c .getAChild ( ) |
484- result = child .( RegExpConstant ) .getValue ( )
485- or
486- child .( RegExpCharacterRange ) .isRange ( result , _)
487- or
488- child .( RegExpCharacterRange ) .isRange ( _, result )
489- )
490- }
491-
492- /**
493- * Gets a char that is relevant for ReDoS analysis of `symbol`.
494- * The result is either mentioned in the character class `symbol`,
495- * or, if `symbol` is an inverted character class, then the result is the next/previous charcode.
496- */
497- pragma [ noinline]
498- private string getARelevantCharClassChar ( TInputSymbol symbol ) {
499- exists ( RegExpCharacterClass cc | symbol = CharClass ( cc ) | result = getAMentionedChar ( cc ) )
500- or
501- exists ( RegExpCharacterClass cc | symbol = InvertedCharClass ( cc ) |
502- result = nextChar ( getAMentionedChar ( cc ) ) or
503- nextChar ( result ) = getAMentionedChar ( cc )
504- )
505- }
506-
507559/**
508560 * Gets a char that is matched by both the character classes `c` and `d`.
561+ * Has no result if `c` and `d` are the same character class.
509562 */
510- private string getAOverlapBetweenCharacterClasses ( TInputSymbol c , TInputSymbol d ) {
563+ private string getAOverlapBetweenCharacterClasses ( CharacterClass c , CharacterClass d ) {
511564 sharesRoot ( c , d ) and
512- result = [ getARelevantCharClassChar ( c ) , getARelevantCharClassChar ( d ) ] and
513- (
514- // pos-neg
515- exists ( RegExpCharacterClass negClass , RegExpCharacterClass posClass |
516- c = CharClass ( posClass ) and
517- d = InvertedCharClass ( negClass ) and
518- charClassMatchesChar ( posClass , result ) and
519- not charClassMatchesChar ( negClass , result )
520- )
521- or
522- // pos-pos
523- exists ( RegExpCharacterClass class1 , RegExpCharacterClass class2 | not class1 = class2 |
524- c = CharClass ( class1 ) and
525- d = CharClass ( class2 ) and
526- charClassMatchesChar ( class1 , result ) and
527- charClassMatchesChar ( class2 , result )
528- )
529- or
530- // neg-neg
531- exists ( RegExpCharacterClass class1 , RegExpCharacterClass class2 | not class1 = class2 |
532- c = InvertedCharClass ( class1 ) and
533- d = InvertedCharClass ( class2 ) and
534- not charClassMatchesChar ( class1 , result ) and
535- not charClassMatchesChar ( class2 , result )
536- )
537- )
565+ result = [ c .getARelevantChar ( ) , d .getARelevantChar ( ) ] and
566+ c .matches ( result ) and
567+ d .matches ( result ) and
568+ not c = d
538569}
539570
540571/**
@@ -547,11 +578,7 @@ string intersect(InputSymbol c, InputSymbol d) {
547578 (
548579 d = Char ( result )
549580 or
550- exists ( RegExpCharacterClass cc | d = CharClass ( cc ) | charClassMatchesChar ( cc , result ) )
551- or
552- exists ( RegExpCharacterClass cc | d = InvertedCharClass ( cc ) |
553- not charClassMatchesChar ( cc , result )
554- )
581+ d .( CharacterClass ) .matches ( result )
555582 )
556583 or
557584 d = Dot ( ) and
@@ -562,17 +589,9 @@ string intersect(InputSymbol c, InputSymbol d) {
562589 or
563590 result = getMinOverlapBetweenCharacterClasses ( c , d )
564591 or
565- exists ( RegExpCharacterClass cc | c = InvertedCharClass ( cc ) and result = chooseFromInverted ( cc ) |
566- d = InvertedCharClass ( cc ) and sharesRoot ( c , d )
567- or
568- d = Dot ( ) and
569- not ( result = "\n" or result = "\r" )
570- or
571- d = Any ( )
572- )
573- or
574- exists ( RegExpCharacterClass cc | c = CharClass ( cc ) and result = choose ( cc ) |
575- d = CharClass ( cc ) and sharesRoot ( c , d )
592+ result = c .( CharacterClass ) .choose ( ) and
593+ (
594+ d = c
576595 or
577596 d = Dot ( ) and
578597 not ( result = "\n" or result = "\r" )
@@ -592,20 +611,6 @@ string intersect(InputSymbol c, InputSymbol d) {
592611 result = intersect ( d , c )
593612}
594613
595- /**
596- * Gets a character matched by character class `cc`.
597- */
598- string choose ( RegExpCharacterClass cc ) {
599- exists ( CharClass ( cc ) ) and
600- result =
601- min ( string c |
602- exists ( RegExpTerm child | child = cc .getAChild ( ) |
603- c = child .( RegExpConstant ) .getValue ( ) or
604- child .( RegExpCharacterRange ) .isRange ( c , _)
605- )
606- )
607- }
608-
609614/**
610615 * Gets the char after `c` (from a simplified ASCII table).
611616 */
@@ -624,21 +629,6 @@ int ascii(string char) {
624629 )
625630}
626631
627- /**
628- * Chooses a char matched by the inverted char class `cc`.
629- */
630- string chooseFromInverted ( RegExpCharacterClass cc ) {
631- exists ( InvertedCharClass ( cc ) ) and
632- // The next char after the max of the inverted charclass.
633- result =
634- nextChar ( max ( string c |
635- exists ( RegExpTerm child | child = cc .getAChild ( ) |
636- c = child .( RegExpConstant ) .getValue ( ) or
637- child .( RegExpCharacterRange ) .isRange ( _, c )
638- )
639- ) )
640- }
641-
642632/**
643633 * Gets a string corresponding to the trace `t`.
644634 */
0 commit comments