Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit fa54ad1

Browse files
committed
refactor character class implementation in ReDoS.ql - preparing support for RegExpCharacterClassEscape
1 parent a09ffd5 commit fa54ad1

2 files changed

Lines changed: 132 additions & 133 deletions

File tree

  • javascript/ql

javascript/ql/src/Performance/ReDoS.ql

Lines changed: 123 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -139,20 +139,10 @@ newtype TInputSymbol =
139139
Char(string c) { c = any(RegExpConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_) } or
140140
/**
141141
* An input symbol representing all characters matched by
142-
* (positive, non-universal) character class `recc`.
142+
* (non-universal) character class `recc`.
143143
*/
144144
CharClass(RegExpCharacterClass recc) {
145145
getRoot(recc).isRelevant() and
146-
not recc.isInverted() and
147-
not recc.isUniversalClass()
148-
} or
149-
/**
150-
* An input symbol representing all characters matched by
151-
* the inverted (non-universal) character class `recc`.
152-
*/
153-
InvertedCharClass(RegExpCharacterClass recc) {
154-
getRoot(recc).isRelevant() and
155-
recc.isInverted() and
156146
not recc.isUniversalClass()
157147
} or
158148
/** An input symbol representing all characters matched by `.`. */
@@ -166,7 +156,6 @@ newtype TInputSymbol =
166156
* Holds if `a` and `b` are input symbols from the same regexp.
167157
* (And not a `Dot()`, `Any()` or `Epsilon()`)
168158
*/
169-
pragma[noinline]
170159
private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
171160
exists(RegExpRoot root |
172161
belongsTo(a, root) and
@@ -182,8 +171,6 @@ private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
182171
a = Char(term.(RegExpConstant).getValue().charAt(_))
183172
or
184173
a = CharClass(term)
185-
or
186-
a = InvertedCharClass(term)
187174
)
188175
}
189176

@@ -198,14 +185,121 @@ class InputSymbol extends TInputSymbol {
198185
or
199186
result = any(RegExpCharacterClass recc | this = CharClass(recc)).toString()
200187
or
201-
result = any(RegExpCharacterClass recc | this = InvertedCharClass(recc)).toString()
202-
or
203188
this = Dot() and result = "."
204189
or
205190
this = Any() and result = "[^]"
206191
}
207192
}
208193

194+
/**
195+
* An abstract input symbol that represents a character class.
196+
*/
197+
abstract class CharacterClass extends InputSymbol {
198+
/**
199+
* Gets a char that is likely relevant for the ReDoS analysis of this character class.
200+
* That is: one of the endpoints of the character class,
201+
* or a char that is off by one from one of the endpoints of the character class (if this is an inverted character class).
202+
*/
203+
abstract string getARelevantChar();
204+
205+
/**
206+
* Holds if this character class matches `char`.
207+
*/
208+
bindingset[char]
209+
abstract predicate matches(string char);
210+
211+
/**
212+
* Gets a single character matched by this character class.
213+
*/
214+
abstract string choose();
215+
}
216+
217+
/**
218+
* Provides implementations for `CharacterClass`.
219+
*/
220+
private module CharacterClasses {
221+
/**
222+
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
223+
*/
224+
bindingset[char]
225+
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
226+
exists(RegExpTerm child | child = cc.getAChild() |
227+
char = child.(RegExpConstant).getValue()
228+
or
229+
exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
230+
lo <= char and char <= hi
231+
)
232+
// TODO: RegExpCharacterClassEscape.
233+
)
234+
}
235+
236+
/**
237+
* Gets a char that is mentioned in the character class `c`.
238+
*/
239+
private string getAMentionedChar(RegExpCharacterClass c) {
240+
exists(RegExpTerm child | child = c.getAChild() |
241+
result = child.(RegExpConstant).getValue()
242+
or
243+
child.(RegExpCharacterRange).isRange(result, _)
244+
or
245+
child.(RegExpCharacterRange).isRange(_, result)
246+
)
247+
}
248+
249+
/**
250+
* An implementation of `CharacterClass` for positive (non-inverted) character classes.
251+
*/
252+
private class PositiveCharacterClass extends CharacterClass {
253+
RegExpCharacterClass cc;
254+
255+
PositiveCharacterClass() { this = CharClass(cc) and not cc.isInverted() }
256+
257+
override string getARelevantChar() { result = getAMentionedChar(cc) }
258+
259+
bindingset[char]
260+
override predicate matches(string char) { hasChildThatMatches(cc, char) }
261+
262+
override string choose() {
263+
result =
264+
min(string c |
265+
exists(RegExpTerm child | child = cc.getAChild() |
266+
c = child.(RegExpConstant).getValue() or
267+
child.(RegExpCharacterRange).isRange(c, _)
268+
)
269+
)
270+
}
271+
}
272+
273+
/**
274+
* An implementation of `CharacterClass` for inverted character classes.
275+
*/
276+
private class InvertedCharacterClass extends CharacterClass {
277+
RegExpCharacterClass cc;
278+
279+
InvertedCharacterClass() { this = CharClass(cc) and cc.isInverted() }
280+
281+
override string getARelevantChar() {
282+
result = nextChar(getAMentionedChar(cc)) or
283+
nextChar(result) = getAMentionedChar(cc)
284+
}
285+
286+
bindingset[char]
287+
override predicate matches(string char) { not hasChildThatMatches(cc, char) }
288+
289+
override string choose() {
290+
// The next char after the max of the inverted charclass.
291+
result =
292+
nextChar(max(string c |
293+
exists(RegExpTerm child | child = cc.getAChild() |
294+
c = child.(RegExpConstant).getValue() or
295+
child.(RegExpCharacterRange).isRange(_, c)
296+
)
297+
))
298+
}
299+
}
300+
// TODO: Implementations for RegExpCharacterClassEscape
301+
}
302+
209303
newtype TState =
210304
Match(RegExpTerm t, int i) {
211305
getRoot(t).isRelevant() and
@@ -303,10 +397,11 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
303397
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
304398
or
305399
q1 = before(cc) and
306-
(lbl = CharClass(cc) or lbl = InvertedCharClass(cc)) and
400+
lbl = CharClass(cc) and
307401
q2 = after(cc)
308402
)
309403
or
404+
// TODO: add a disjunct for `RegExpCharacterClassEscape`.
310405
exists(RegExpAlt alt | lbl = Epsilon() | q1 = before(alt) and q2 = before(alt.getAChild()))
311406
or
312407
exists(RegExpSequence seq | lbl = Epsilon() | q1 = before(seq) and q2 = before(seq.getChild(0)))
@@ -454,87 +549,23 @@ newtype Trace =
454549
t = Nil() and isFork(_, s1, s2, _, _)
455550
}
456551

457-
/**
458-
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
459-
*/
460-
bindingset[char]
461-
predicate charClassMatchesChar(RegExpCharacterClass cc, string char) {
462-
exists(RegExpTerm child | child = cc.getAChild() |
463-
char = child.(RegExpConstant).getValue()
464-
or
465-
exists(string lo, string hi | child.(RegExpCharacterRange).isRange(lo, hi) |
466-
lo <= char and char <= hi
467-
)
468-
)
469-
}
470-
471552
/**
472553
* Gets the minimum char that is matched by both the character classes `c` and `d`.
473554
*/
474-
pragma[noinline]
475-
private string getMinOverlapBetweenCharacterClasses(TInputSymbol c, TInputSymbol d) {
555+
private string getMinOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
476556
result = min(getAOverlapBetweenCharacterClasses(c, d))
477557
}
478558

479-
/**
480-
* Gets a char that is mentioned in the character class `c`.
481-
*/
482-
private string getAMentionedChar(RegExpCharacterClass c) {
483-
exists(RegExpTerm child | child = c.getAChild() |
484-
result = child.(RegExpConstant).getValue()
485-
or
486-
child.(RegExpCharacterRange).isRange(result, _)
487-
or
488-
child.(RegExpCharacterRange).isRange(_, result)
489-
)
490-
}
491-
492-
/**
493-
* Gets a char that is relevant for ReDoS analysis of `symbol`.
494-
* The result is either mentioned in the character class `symbol`,
495-
* or, if `symbol` is an inverted character class, then the result is the next/previous charcode.
496-
*/
497-
pragma[noinline]
498-
private string getARelevantCharClassChar(TInputSymbol symbol) {
499-
exists(RegExpCharacterClass cc | symbol = CharClass(cc) | result = getAMentionedChar(cc))
500-
or
501-
exists(RegExpCharacterClass cc | symbol = InvertedCharClass(cc) |
502-
result = nextChar(getAMentionedChar(cc)) or
503-
nextChar(result) = getAMentionedChar(cc)
504-
)
505-
}
506-
507559
/**
508560
* Gets a char that is matched by both the character classes `c` and `d`.
561+
* `c` and `d` must not be the same character class.
509562
*/
510-
private string getAOverlapBetweenCharacterClasses(TInputSymbol c, TInputSymbol d) {
563+
private string getAOverlapBetweenCharacterClasses(CharacterClass c, CharacterClass d) {
511564
sharesRoot(c, d) and
512-
result = [getARelevantCharClassChar(c), getARelevantCharClassChar(d)] and
513-
(
514-
// pos-neg
515-
exists(RegExpCharacterClass negClass, RegExpCharacterClass posClass |
516-
c = CharClass(posClass) and
517-
d = InvertedCharClass(negClass) and
518-
charClassMatchesChar(posClass, result) and
519-
not charClassMatchesChar(negClass, result)
520-
)
521-
or
522-
// pos-pos
523-
exists(RegExpCharacterClass class1, RegExpCharacterClass class2 | not class1 = class2 |
524-
c = CharClass(class1) and
525-
d = CharClass(class2) and
526-
charClassMatchesChar(class1, result) and
527-
charClassMatchesChar(class2, result)
528-
)
529-
or
530-
// neg-neg
531-
exists(RegExpCharacterClass class1, RegExpCharacterClass class2 | not class1 = class2 |
532-
c = InvertedCharClass(class1) and
533-
d = InvertedCharClass(class2) and
534-
not charClassMatchesChar(class1, result) and
535-
not charClassMatchesChar(class2, result)
536-
)
537-
)
565+
result = [c.getARelevantChar(), d.getARelevantChar()] and
566+
c.matches(result) and
567+
d.matches(result) and
568+
not c = d
538569
}
539570

540571
/**
@@ -547,11 +578,7 @@ string intersect(InputSymbol c, InputSymbol d) {
547578
(
548579
d = Char(result)
549580
or
550-
exists(RegExpCharacterClass cc | d = CharClass(cc) | charClassMatchesChar(cc, result))
551-
or
552-
exists(RegExpCharacterClass cc | d = InvertedCharClass(cc) |
553-
not charClassMatchesChar(cc, result)
554-
)
581+
d.(CharacterClass).matches(result)
555582
)
556583
or
557584
d = Dot() and
@@ -562,17 +589,9 @@ string intersect(InputSymbol c, InputSymbol d) {
562589
or
563590
result = getMinOverlapBetweenCharacterClasses(c, d)
564591
or
565-
exists(RegExpCharacterClass cc | c = InvertedCharClass(cc) and result = chooseFromInverted(cc) |
566-
d = InvertedCharClass(cc) and sharesRoot(c, d)
567-
or
568-
d = Dot() and
569-
not (result = "\n" or result = "\r")
570-
or
571-
d = Any()
572-
)
573-
or
574-
exists(RegExpCharacterClass cc | c = CharClass(cc) and result = choose(cc) |
575-
d = CharClass(cc) and sharesRoot(c, d)
592+
result = c.(CharacterClass).choose() and
593+
(
594+
d = c
576595
or
577596
d = Dot() and
578597
not (result = "\n" or result = "\r")
@@ -592,20 +611,6 @@ string intersect(InputSymbol c, InputSymbol d) {
592611
result = intersect(d, c)
593612
}
594613

595-
/**
596-
* Gets a character matched by character class `cc`.
597-
*/
598-
string choose(RegExpCharacterClass cc) {
599-
exists(CharClass(cc)) and
600-
result =
601-
min(string c |
602-
exists(RegExpTerm child | child = cc.getAChild() |
603-
c = child.(RegExpConstant).getValue() or
604-
child.(RegExpCharacterRange).isRange(c, _)
605-
)
606-
)
607-
}
608-
609614
/**
610615
* Gets the char after `c` (from a simplified ASCII table).
611616
*/
@@ -624,21 +629,6 @@ int ascii(string char) {
624629
)
625630
}
626631

627-
/**
628-
* Chooses a char matched by the inverted char class `cc`.
629-
*/
630-
string chooseFromInverted(RegExpCharacterClass cc) {
631-
exists(InvertedCharClass(cc)) and
632-
// The next char after the max of the inverted charclass.
633-
result =
634-
nextChar(max(string c |
635-
exists(RegExpTerm child | child = cc.getAChild() |
636-
c = child.(RegExpConstant).getValue() or
637-
child.(RegExpCharacterRange).isRange(_, c)
638-
)
639-
))
640-
}
641-
642632
/**
643633
* Gets a string corresponding to the trace `t`.
644634
*/

javascript/ql/test/query-tests/Performance/ReDoS/tst.js

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,12 @@ var bad27 = /(([a-z]|[d-h])*)"/;
126126

127127
// NOT GOOD
128128
var bad27 = /(([^a-z]|[^0-9])*)"/;
129+
130+
// NOT GOOD
131+
var bad28 = /((\d|[0-9])*)"/;
132+
133+
// NOT GOOD
134+
var bad29 = /((\s|\s)*)"/;
135+
136+
// NOT GOOD
137+
var bad29 = /((\w|G)*)"/;

0 commit comments

Comments
 (0)