@@ -199,7 +199,7 @@ CharClass getCanonicalCharClass(RegExpTerm term) {
199199/**
200200 * Holds if `a` and `b` are input symbols from the same regexp.
201201 */
202- private predicate sharesRoot ( TInputSymbol a , TInputSymbol b ) {
202+ private predicate sharesRoot ( InputSymbol a , InputSymbol b ) {
203203 exists ( RegExpRoot root |
204204 belongsTo ( a , root ) and
205205 belongsTo ( b , root )
@@ -209,7 +209,7 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
209209/**
210210 * Holds if the `a` is an input symbol from a regexp that has root `root`.
211211 */
212- private predicate belongsTo ( TInputSymbol a , RegExpRoot root ) {
212+ private predicate belongsTo ( InputSymbol a , RegExpRoot root ) {
213213 exists ( State s | getRoot ( s .getRepr ( ) ) = root |
214214 delta ( s , a , _)
215215 or
@@ -378,6 +378,13 @@ private module CharacterClasses {
378378 )
379379 }
380380
381+ bindingset [ char, cc]
382+ private string caseNormalize ( string char , RegExpTerm cc ) {
383+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
384+ then result = char .toLowerCase ( )
385+ else result = char
386+ }
387+
381388 /**
382389 * An implementation of `CharacterClass` for positive (non inverted) character classes.
383390 */
@@ -386,7 +393,7 @@ private module CharacterClasses {
386393
387394 PositiveCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and not cc .isInverted ( ) }
388395
389- override string getARelevantChar ( ) { result = getAMentionedChar ( cc ) }
396+ override string getARelevantChar ( ) { result = caseNormalize ( getAMentionedChar ( cc ) , cc ) }
390397
391398 override predicate matches ( string char ) { hasChildThatMatches ( cc , char ) }
392399 }
@@ -400,8 +407,8 @@ private module CharacterClasses {
400407 InvertedCharacterClass ( ) { this = getCanonicalCharClass ( cc ) and cc .isInverted ( ) }
401408
402409 override string getARelevantChar ( ) {
403- result = nextChar ( getAMentionedChar ( cc ) ) or
404- nextChar ( result ) = getAMentionedChar ( cc )
410+ result = nextChar ( caseNormalize ( getAMentionedChar ( cc ) , cc ) ) or
411+ nextChar ( result ) = caseNormalize ( getAMentionedChar ( cc ) , cc )
405412 }
406413
407414 bindingset [ char]
@@ -428,13 +435,12 @@ private module CharacterClasses {
428435 */
429436 private class PositiveCharacterClassEscape extends CharacterClass {
430437 string charClass ;
438+ RegExpTerm cc ;
431439
432440 PositiveCharacterClassEscape ( ) {
433- exists ( RegExpTerm cc |
434- isEscapeClass ( cc , charClass ) and
435- this = getCanonicalCharClass ( cc ) and
436- charClass = [ "d" , "s" , "w" ]
437- )
441+ isEscapeClass ( cc , charClass ) and
442+ this = getCanonicalCharClass ( cc ) and
443+ charClass = [ "d" , "s" , "w" ]
438444 }
439445
440446 override string getARelevantChar ( ) {
@@ -445,7 +451,9 @@ private module CharacterClasses {
445451 result = " "
446452 or
447453 charClass = "w" and
448- result = [ "a" , "Z" , "_" , "0" , "9" ]
454+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
455+ then result = [ "a" , "z" , "_" , "0" , "9" ]
456+ else result = [ "a" , "Z" , "_" , "0" , "9" ]
449457 }
450458
451459 override predicate matches ( string char ) { classEscapeMatches ( charClass , char ) }
@@ -492,6 +500,34 @@ private module CharacterClasses {
492500 not classEscapeMatches ( charClass .toLowerCase ( ) , char )
493501 }
494502 }
503+
504+ /** Gets a representative for all char classes that match the same chars as `c`. */
505+ CharacterClass normalize ( CharacterClass c ) {
506+ exists ( string normalization |
507+ normalization = getMormalizationString ( c ) and
508+ result =
509+ min ( CharacterClass cc , string raw |
510+ getMormalizationString ( cc ) = normalization and cc = CharClass ( raw )
511+ |
512+ cc order by raw
513+ )
514+ )
515+ }
516+
517+ /** Gets a string representing all the chars matched by `c` */
518+ private string getMormalizationString ( CharacterClass c ) {
519+ ( c instanceof PositiveCharacterClass or c instanceof PositiveCharacterClassEscape ) and
520+ result = concat ( string char | c .matches ( char ) and char = CharacterClasses:: getARelevantChar ( ) )
521+ or
522+ ( c instanceof InvertedCharacterClass or c instanceof NegativeCharacterClassEscape ) and
523+ // the string produced by the concat can not contain repeated chars
524+ // so by starting the below with "nn" we can guarantee that
525+ // it will not overlap with the above case.
526+ // and a negative char class can never match the same chars as a positive one, so we don't miss any results from this.
527+ result =
528+ "nn:" +
529+ concat ( string char | not c .matches ( char ) and char = CharacterClasses:: getARelevantChar ( ) )
530+ }
495531}
496532
497533private class EdgeLabel extends TInputSymbol {
@@ -620,13 +656,17 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
620656 cc .isUniversalClass ( ) and q1 = before ( cc ) and lbl = Any ( ) and q2 = after ( cc )
621657 or
622658 q1 = before ( cc ) and
623- lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) and
659+ lbl =
660+ CharacterClasses:: normalize ( CharClass ( cc .getRawValue ( ) + "|" +
661+ getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) ) and
624662 q2 = after ( cc )
625663 )
626664 or
627665 exists ( RegExpTerm cc | isEscapeClass ( cc , _) |
628666 q1 = before ( cc ) and
629- lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) and
667+ lbl =
668+ CharacterClasses:: normalize ( CharClass ( cc .getRawValue ( ) + "|" +
669+ getCanonicalizationFlags ( cc .getRootTerm ( ) ) ) ) and
630670 q2 = after ( cc )
631671 )
632672 or
0 commit comments