@@ -140,9 +140,9 @@ class RegExpRoot extends RegExpTerm {
140140 // there is at least one repetition
141141 getRoot ( any ( InfiniteRepetitionQuantifier q ) ) = this and
142142 // is actually used as a RegExp
143- isUsedAsRegExp ( ) and
143+ this . isUsedAsRegExp ( ) and
144144 // not excluded for library specific reasons
145- not isExcluded ( getRootTerm ( ) .getParent ( ) )
145+ not isExcluded ( this . getRootTerm ( ) .getParent ( ) )
146146 }
147147}
148148
@@ -218,7 +218,7 @@ private newtype TInputSymbol =
218218 recc instanceof RegExpCharacterClass and
219219 not recc .( RegExpCharacterClass ) .isUniversalClass ( )
220220 or
221- recc instanceof RegExpCharacterClassEscape
221+ isEscapeClass ( recc , _ )
222222 )
223223 } or
224224 /** An input symbol representing all characters matched by `.`. */
@@ -302,7 +302,7 @@ abstract class CharacterClass extends InputSymbol {
302302 /**
303303 * Gets a character matched by this character class.
304304 */
305- string choose ( ) { result = getARelevantChar ( ) and matches ( result ) }
305+ string choose ( ) { result = this . getARelevantChar ( ) and this . matches ( result ) }
306306}
307307
308308/**
@@ -340,13 +340,13 @@ private module CharacterClasses {
340340 char <= hi
341341 )
342342 or
343- exists ( RegExpCharacterClassEscape escape | escape = child |
344- escape . getValue ( ) = escape . getValue ( ) . toLowerCase ( ) and
345- classEscapeMatches ( escape . getValue ( ) , char )
343+ exists ( string charClass | isEscapeClass ( child , charClass ) |
344+ charClass . toLowerCase ( ) = charClass and
345+ classEscapeMatches ( charClass , char )
346346 or
347347 char = getARelevantChar ( ) and
348- escape . getValue ( ) = escape . getValue ( ) . toUpperCase ( ) and
349- not classEscapeMatches ( escape . getValue ( ) . toLowerCase ( ) , char )
348+ charClass . toUpperCase ( ) = charClass and
349+ not classEscapeMatches ( charClass . toLowerCase ( ) , char ) // negate the lower-case class, as the replaced line and NegativeCharacterClassEscape.matches do
350350 )
351351 )
352352 }
@@ -409,10 +409,10 @@ private module CharacterClasses {
409409 or
410410 child .( RegExpCharacterRange ) .isRange ( _, result )
411411 or
412- exists ( RegExpCharacterClassEscape escape | child = escape |
413- result = min ( string s | classEscapeMatches ( escape . getValue ( ) .toLowerCase ( ) , s ) )
412+ exists ( string charClass | isEscapeClass ( child , charClass ) |
413+ result = min ( string s | classEscapeMatches ( charClass .toLowerCase ( ) , s ) )
414414 or
415- result = max ( string s | classEscapeMatches ( escape . getValue ( ) .toLowerCase ( ) , s ) )
415+ result = max ( string s | classEscapeMatches ( charClass .toLowerCase ( ) , s ) )
416416 )
417417 )
418418 }
@@ -466,33 +466,36 @@ private module CharacterClasses {
466466 * An implementation of `CharacterClass` for \d, \s, and \w.
467467 */
468468 private class PositiveCharacterClassEscape extends CharacterClass {
469- RegExpCharacterClassEscape cc ;
469+ RegExpTerm cc ;
470+ string charClass ;
470471
471472 PositiveCharacterClassEscape ( ) {
472- this = getCanonicalCharClass ( cc ) and cc .getValue ( ) = [ "d" , "s" , "w" ]
473+ isEscapeClass ( cc , charClass ) and
474+ this = getCanonicalCharClass ( cc ) and
475+ charClass = [ "d" , "s" , "w" ]
473476 }
474477
475478 override string getARelevantChar ( ) {
476- cc . getValue ( ) = "d" and
479+ charClass = "d" and
477480 result = [ "0" , "9" ]
478481 or
479- cc . getValue ( ) = "s" and
482+ charClass = "s" and
480483 result = " "
481484 or
482- cc . getValue ( ) = "w" and
485+ charClass = "w" and
483486 result = [ "a" , "Z" , "_" , "0" , "9" ]
484487 }
485488
486- override predicate matches ( string char ) { classEscapeMatches ( cc . getValue ( ) , char ) }
489+ override predicate matches ( string char ) { classEscapeMatches ( charClass , char ) }
487490
488491 override string choose ( ) {
489- cc . getValue ( ) = "d" and
492+ charClass = "d" and
490493 result = "9"
491494 or
492- cc . getValue ( ) = "s" and
495+ charClass = "s" and
493496 result = " "
494497 or
495- cc . getValue ( ) = "w" and
498+ charClass = "w" and
496499 result = "a"
497500 }
498501 }
@@ -501,26 +504,29 @@ private module CharacterClasses {
501504 * An implementation of `CharacterClass` for \D, \S, and \W.
502505 */
503506 private class NegativeCharacterClassEscape extends CharacterClass {
504- RegExpCharacterClassEscape cc ;
507+ RegExpTerm cc ;
508+ string charClass ;
505509
506510 NegativeCharacterClassEscape ( ) {
507- this = getCanonicalCharClass ( cc ) and cc .getValue ( ) = [ "D" , "S" , "W" ]
511+ isEscapeClass ( cc , charClass ) and
512+ this = getCanonicalCharClass ( cc ) and
513+ charClass = [ "D" , "S" , "W" ]
508514 }
509515
510516 override string getARelevantChar ( ) {
511- cc . getValue ( ) = "D" and
517+ charClass = "D" and
512518 result = [ "a" , "Z" , "!" ]
513519 or
514- cc . getValue ( ) = "S" and
520+ charClass = "S" and
515521 result = [ "a" , "9" , "!" ]
516522 or
517- cc . getValue ( ) = "W" and
523+ charClass = "W" and
518524 result = [ " " , "!" ]
519525 }
520526
521527 bindingset [ char]
522528 override predicate matches ( string char ) {
523- not classEscapeMatches ( cc . getValue ( ) .toLowerCase ( ) , char )
529+ not classEscapeMatches ( charClass .toLowerCase ( ) , char )
524530 }
525531 }
526532}
@@ -533,6 +539,55 @@ private class EdgeLabel extends TInputSymbol {
533539 }
534540}
535541
542+ /**
543+ * A RegExp term that acts like a plus.
544+ * Either it's a RegExpPlus, or it is a RegExpRange with lower bound 1 whose upper bound is >= 30 or absent (`getUpperBound()` has no result; presumably an open-ended range such as {1,}).
545+ * 30 has been chosen as a threshold because for exponential blowup 2^30 is enough to get a decent DoS attack.
546+ */
547+ private class EffectivelyPlus extends RegExpTerm {
548+ EffectivelyPlus ( ) {
549+ this instanceof RegExpPlus
550+ or
551+ exists ( RegExpRange range |
552+ range .getLowerBound ( ) = 1 and
553+ ( range .getUpperBound ( ) >= 30 or not exists ( range .getUpperBound ( ) ) )
554+ |
555+ this = range
556+ )
557+ }
558+ }
559+
560+ /**
561+ * A RegExp term that acts like a star.
562+ * Either it's a RegExpStar, or it is a RegExpRange with lower bound 0 whose upper bound is >= 30 or absent (`getUpperBound()` has no result; presumably an open-ended range such as {0,}).
563+ */
564+ private class EffectivelyStar extends RegExpTerm {
565+ EffectivelyStar ( ) {
566+ this instanceof RegExpStar
567+ or
568+ exists ( RegExpRange range |
569+ range .getLowerBound ( ) = 0 and
570+ ( range .getUpperBound ( ) >= 30 or not exists ( range .getUpperBound ( ) ) )
571+ |
572+ this = range
573+ )
574+ }
575+ }
576+
577+ /**
578+ * A RegExp term that acts like a question mark.
579+ * Either it's a RegExpOpt, or it is a RegExpRange with lower bound 0 and upper bound 1, i.e. {0,1}.
580+ */
581+ private class EffectivelyQuestion extends RegExpTerm {
582+ EffectivelyQuestion ( ) {
583+ this instanceof RegExpOpt
584+ or
585+ exists ( RegExpRange range | range .getLowerBound ( ) = 0 and range .getUpperBound ( ) = 1 |
586+ this = range
587+ )
588+ }
589+ }
590+
536591/**
537592 * Gets the state before matching `t`.
538593 */
@@ -542,7 +597,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
542597/**
543598 * Gets a state the NFA may be in after matching `t`.
544599 */
545- private State after ( RegExpTerm t ) {
600+ State after ( RegExpTerm t ) {
546601 exists ( RegExpAlt alt | t = alt .getAChild ( ) | result = after ( alt ) )
547602 or
548603 exists ( RegExpSequence seq , int i | t = seq .getChild ( i ) |
@@ -553,14 +608,14 @@ private State after(RegExpTerm t) {
553608 or
554609 exists ( RegExpGroup grp | t = grp .getAChild ( ) | result = after ( grp ) )
555610 or
556- exists ( RegExpStar star | t = star .getAChild ( ) | result = before ( star ) )
611+ exists ( EffectivelyStar star | t = star .getAChild ( ) | result = before ( star ) )
557612 or
558- exists ( RegExpPlus plus | t = plus .getAChild ( ) |
613+ exists ( EffectivelyPlus plus | t = plus .getAChild ( ) |
559614 result = before ( plus ) or
560615 result = after ( plus )
561616 )
562617 or
563- exists ( RegExpOpt opt | t = opt .getAChild ( ) | result = after ( opt ) )
618+ exists ( EffectivelyQuestion opt | t = opt .getAChild ( ) | result = after ( opt ) )
564619 or
565620 exists ( RegExpRoot root | t = root | result = AcceptAnySuffix ( root ) )
566621}
@@ -599,7 +654,7 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
599654 q2 = after ( cc )
600655 )
601656 or
602- exists ( RegExpCharacterClassEscape cc |
657+ exists ( RegExpTerm cc | isEscapeClass ( cc , _ ) |
603658 q1 = before ( cc ) and
604659 lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) and
605660 q2 = after ( cc )
@@ -611,15 +666,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
611666 or
612667 exists ( RegExpGroup grp | lbl = Epsilon ( ) | q1 = before ( grp ) and q2 = before ( grp .getChild ( 0 ) ) )
613668 or
614- exists ( RegExpStar star | lbl = Epsilon ( ) |
669+ exists ( EffectivelyStar star | lbl = Epsilon ( ) |
615670 q1 = before ( star ) and q2 = before ( star .getChild ( 0 ) )
616671 or
617672 q1 = before ( star ) and q2 = after ( star )
618673 )
619674 or
620- exists ( RegExpPlus plus | lbl = Epsilon ( ) | q1 = before ( plus ) and q2 = before ( plus .getChild ( 0 ) ) )
675+ exists ( EffectivelyPlus plus | lbl = Epsilon ( ) |
676+ q1 = before ( plus ) and q2 = before ( plus .getChild ( 0 ) )
677+ )
621678 or
622- exists ( RegExpOpt opt | lbl = Epsilon ( ) |
679+ exists ( EffectivelyQuestion opt | lbl = Epsilon ( ) |
623680 q1 = before ( opt ) and q2 = before ( opt .getChild ( 0 ) )
624681 or
625682 q1 = before ( opt ) and q2 = after ( opt )
@@ -671,7 +728,7 @@ RegExpRoot getRoot(RegExpTerm term) {
671728/**
672729 * A state in the NFA.
673730 */
674- private newtype TState =
731+ newtype TState =
675732 /**
676733 * A state representing that the NFA is about to match a term.
677734 * `i` is used to index into multi-char literals.
@@ -801,29 +858,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
801858 result = Any ( )
802859}
803860
861+ /**
862+ * Holds if `state` is a start state: either `mkMatch(r)` for some `RegExpRoot r`, or `after(car)` for some `RegExpCaret car`.
863+ */
864+ predicate isStartState ( State state ) {
865+ state = mkMatch ( any ( RegExpRoot r ) )
866+ or
867+ exists ( RegExpCaret car | state = after ( car ) )
868+ }
869+
804870/**
805871 * Predicates for constructing a prefix string that leads to a given state.
806872 */
807873private module PrefixConstruction {
808- /**
809- * Holds if `state` starts the string matched by the regular expression.
810- */
811- private predicate isStartState ( State state ) {
812- state instanceof StateInPumpableRegexp and
813- (
814- state = Match ( any ( RegExpRoot r ) , _)
815- or
816- exists ( RegExpCaret car | state = after ( car ) )
817- )
818- }
819-
820874 /**
821875 * Holds if `state` is the textually last start state for the regular expression.
822876 */
823877 private predicate lastStartState ( State state ) {
824878 exists ( RegExpRoot root |
825879 state =
826- max ( State s , Location l |
880+ max ( StateInPumpableRegexp s , Location l |
827881 isStartState ( s ) and getRoot ( s .getRepr ( ) ) = root and l = s .getRepr ( ) .getLocation ( )
828882 |
829883 s
0 commit comments