Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7ce91e9

Browse files
committed
Introduce canonical representatives of RegExpTerms to decrease the number of InputSymbols in the NFA
1 parent 34dda6d commit 7ce91e9

2 files changed

Lines changed: 52 additions & 17 deletions

File tree

javascript/ql/src/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,23 @@ private class RegexpCharacterConstant extends RegExpConstant {
123123
RegexpCharacterConstant() { this.isCharacter() }
124124
}
125125

126+
/**
127+
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
128+
*
129+
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
130+
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
131+
*/
132+
private predicate isCannonicalTerm(RegExpTerm term, string str) {
133+
term =
134+
rank[1](RegExpTerm t, Location loc, File file |
135+
loc = t.getLocation() and
136+
file = t.getFile() and
137+
str = t.getRawValue()
138+
|
139+
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
140+
)
141+
}
142+
126143
/**
127144
* An abstract input symbol, representing a set of concrete characters.
128145
*/
@@ -133,11 +150,11 @@ private newtype TInputSymbol =
133150
} or
134151
/**
135152
* An input symbol representing all characters matched by
136-
* (non-universal) character class `recc`.
153+
* a (non-universal) character class that has string representation `charClassString`.
137154
*/
138-
CharClass(RegExpTerm recc) {
139-
getRoot(recc).isRelevant() and
140-
(
155+
CharClass(string charClassString) {
156+
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
157+
exists(RegExpTerm recc | isCannonicalTerm(recc, charClassString) |
141158
recc instanceof RegExpCharacterClass and
142159
not recc.(RegExpCharacterClass).isUniversalClass()
143160
or
@@ -168,8 +185,11 @@ private predicate sharesRoot(TInputSymbol a, TInputSymbol b) {
168185
private predicate belongsTo(TInputSymbol a, RegExpRoot root) {
169186
exists(RegExpTerm term | getRoot(term) = root |
170187
a = Char(term.(RegexpCharacterConstant).getValue().charAt(_))
171-
or
172-
a = CharClass(term)
188+
)
189+
or
190+
exists(string str, RegExpTerm term | a = CharClass(str) |
191+
term.getRawValue() = str and
192+
getRoot(term) = root
173193
)
174194
}
175195

@@ -182,7 +202,7 @@ class InputSymbol extends TInputSymbol {
182202
string toString() {
183203
this = Char(result)
184204
or
185-
result = any(RegExpTerm recc | this = CharClass(recc)).toString()
205+
this = CharClass(result)
186206
or
187207
this = Dot() and result = "."
188208
or
@@ -228,7 +248,10 @@ private module CharacterClasses {
228248
*/
229249
pragma[noinline]
230250
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
231-
exists(CharClass(cc)) and
251+
exists(string str |
252+
isCannonicalTerm(cc, str) and
253+
exists(CharClass(str))
254+
) and
232255
exists(RegExpTerm child | child = cc.getAChild() |
233256
char = child.(RegexpCharacterConstant).getValue()
234257
or
@@ -324,7 +347,9 @@ private module CharacterClasses {
324347
private class PositiveCharacterClass extends CharacterClass {
325348
RegExpCharacterClass cc;
326349

327-
PositiveCharacterClass() { this = CharClass(cc) and not cc.isInverted() }
350+
PositiveCharacterClass() {
351+
exists(string str | isCannonicalTerm(cc, str) | this = CharClass(str) and not cc.isInverted())
352+
}
328353

329354
override string getARelevantChar() { result = getAMentionedChar(cc) }
330355

@@ -337,7 +362,9 @@ private module CharacterClasses {
337362
private class InvertedCharacterClass extends CharacterClass {
338363
RegExpCharacterClass cc;
339364

340-
InvertedCharacterClass() { this = CharClass(cc) and cc.isInverted() }
365+
InvertedCharacterClass() {
366+
exists(string str | isCannonicalTerm(cc, str) | this = CharClass(str) and cc.isInverted())
367+
}
341368

342369
override string getARelevantChar() {
343370
result = nextChar(getAMentionedChar(cc)) or
@@ -374,7 +401,11 @@ private module CharacterClasses {
374401
private class PositiveCharacterClassEscape extends CharacterClass {
375402
RegExpCharacterClassEscape cc;
376403

377-
PositiveCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["d", "s", "w"] }
404+
PositiveCharacterClassEscape() {
405+
exists(string str | isCannonicalTerm(cc, str) |
406+
this = CharClass(str) and cc.getValue() = ["d", "s", "w"]
407+
)
408+
}
378409

379410
override string getARelevantChar() {
380411
cc.getValue() = "d" and
@@ -407,7 +438,11 @@ private module CharacterClasses {
407438
private class NegativeCharacterClassEscape extends CharacterClass {
408439
RegExpCharacterClassEscape cc;
409440

410-
NegativeCharacterClassEscape() { this = CharClass(cc) and cc.getValue() = ["D", "S", "W"] }
441+
NegativeCharacterClassEscape() {
442+
exists(string str | isCannonicalTerm(cc, str) |
443+
this = CharClass(str) and cc.getValue() = ["D", "S", "W"]
444+
)
445+
}
411446

412447
override string getARelevantChar() {
413448
cc.getValue() = "D" and
@@ -490,13 +525,13 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
490525
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
491526
or
492527
q1 = before(cc) and
493-
lbl = CharClass(cc) and
528+
lbl = CharClass(cc.getRawValue()) and
494529
q2 = after(cc)
495530
)
496531
or
497532
exists(RegExpCharacterClassEscape cc |
498533
q1 = before(cc) and
499-
lbl = CharClass(cc) and
534+
lbl = CharClass(cc.getRawValue()) and
500535
q2 = after(cc)
501536
)
502537
or

javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
| regexplib/email.js:25:106:25:117 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings starting with '0@0' and containing many repetitions of '0'. |
1616
| regexplib/email.js:25:212:25:223 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
1717
| regexplib/email.js:25:251:25:262 | [a-zA-Z0-9]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
18-
| regexplib/email.js:32:10:32:25 | (?:\\w[\\.\\-\\+]?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '0'. |
18+
| regexplib/email.js:32:10:32:25 | (?:\\w[\\.\\-\\+]?)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
1919
| regexplib/email.js:33:16:33:22 | [-.\\w]* | This part of the regular expression may cause exponential backtracking on strings starting with '0' and containing many repetitions of '0'. |
2020
| regexplib/email.js:33:38:33:51 | ([0-9a-zA-Z])+ | This part of the regular expression may cause exponential backtracking on strings starting with '0@' and containing many repetitions of '00.'. |
2121
| regexplib/email.js:33:53:33:58 | [-\\w]* | This part of the regular expression may cause exponential backtracking on strings starting with '0@0' and containing many repetitions of '0'. |
@@ -41,7 +41,7 @@
4141
| regexplib/strings.js:57:17:57:19 | \\d+ | This part of the regular expression may cause exponential backtracking on strings starting with '?se[' and containing many repetitions of '9'. |
4242
| regexplib/strings.js:81:17:81:19 | \\d+ | This part of the regular expression may cause exponential backtracking on strings starting with '?se[' and containing many repetitions of '9'. |
4343
| regexplib/uri.js:3:128:3:129 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// /' and containing many repetitions of '/'. |
44-
| regexplib/uri.js:3:200:3:215 | (?:\\&?\\w+\\=\\w+)* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// a="' and containing many repetitions of '0=0'. |
44+
| regexplib/uri.js:3:200:3:215 | (?:\\&?\\w+\\=\\w+)* | This part of the regular expression may cause exponential backtracking on strings starting with 'ftp:// a="' and containing many repetitions of 'a=0'. |
4545
| regexplib/uri.js:5:42:5:43 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'A:\\\\a' and containing many repetitions of '\\\\a'. |
4646
| regexplib/uri.js:17:42:17:43 | .* | This part of the regular expression may cause exponential backtracking on strings starting with 'A:\\\\a' and containing many repetitions of '\\\\a'. |
4747
| regexplib/uri.js:38:35:38:40 | [a-z]+ | This part of the regular expression may cause exponential backtracking on strings starting with 'a.' and containing many repetitions of 'a'. |
@@ -109,7 +109,7 @@
109109
| tst.js:227:20:227:20 | b | This part of the regular expression may cause exponential backtracking on strings starting with 'W' and containing many repetitions of 'bW'. |
110110
| tst.js:239:16:239:17 | ab | This part of the regular expression may cause exponential backtracking on strings starting with 'a' and containing many repetitions of 'ab'. |
111111
| tst.js:245:15:245:21 | [\\n\\s]+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of '\\n'. |
112-
| tst.js:254:87:254:89 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz' and containing many repetitions of '0foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
112+
| tst.js:254:87:254:89 | \\w* | This part of the regular expression may cause exponential backtracking on strings starting with 'foobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz' and containing many repetitions of 'afoobarbazfoobarbazfoobarbazfoobarbazfoobarbazfoobarbaz'. |
113113
| tst.js:257:14:257:116 | (.thisisagoddamnlongstringforstresstestingthequery\|\\sthisisagoddamnlongstringforstresstestingthequery)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of ' thisisagoddamnlongstringforstresstestingthequery'. |
114114
| tst.js:260:14:260:77 | (thisisagoddamnlongstringforstresstestingthequery\|this\\w+query)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'thisisagoddamnlongstringforstresstestingthequery'. |
115115
| tst.js:272:21:272:22 | b+ | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'b'. |

0 commit comments

Comments
 (0)