Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 7530902

Browse files
Add approximate support for nested character classes.
This shouldn't fail to parse on any correctly formed character class, but it may give incorrect contents when nested classes are involved.
1 parent d04c99b commit 7530902

1 file changed

Lines changed: 42 additions & 78 deletions

File tree

java/ql/lib/semmle/code/java/regex/regex.qll

Lines changed: 42 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -3,104 +3,64 @@ private import RegexFlowConfigs
33

44
/**
55
* A string literal that is used as a regular expression.
6-
* TODO: adjust parser for java regex syntax
76
*/
87
abstract class RegexString extends Expr {
98
RegexString() { this instanceof StringLiteral }
109

11-
/**
12-
* Helper predicate for `char_set_start(int start, int end)`.
13-
*
14-
* In order to identify left brackets ('[') which actually start a character class,
15-
* we perform a left to right scan of the string.
16-
*
17-
* To avoid negative recursion we return a boolean. See `escaping`,
18-
* the helper for `escapingChar`, for a clean use of this pattern.
19-
*
20-
* result is true for those start chars that actually mark a start of a char set.
21-
*/
22-
boolean char_set_start(int pos) {
23-
exists(int index |
24-
// is opening bracket
25-
this.char_set_delimiter(index, pos) = true and
26-
(
27-
// if this is the first bracket, `pos` starts a char set
28-
index = 1 and result = true
29-
or
30-
// if the previous char set delimiter was not a closing bracket, `pos` does
31-
// not start a char set. This is needed to handle cases such as `[[]` (a
32-
// char set that matches the `[` char)
33-
index > 1 and
34-
not this.char_set_delimiter(index - 1, _) = false and
35-
result = false
36-
or
37-
// special handling of cases such as `[][]` (the character-set of the characters `]` and `[`).
38-
exists(int prev_closing_bracket_pos |
39-
// previous bracket is a closing bracket
40-
this.char_set_delimiter(index - 1, prev_closing_bracket_pos) = false and
41-
if
42-
// check if the character that comes before the previous closing bracket
43-
// is an opening bracket (taking `^` into account)
44-
exists(int pos_before_prev_closing_bracket |
45-
if this.getChar(prev_closing_bracket_pos - 1) = "^"
46-
then pos_before_prev_closing_bracket = prev_closing_bracket_pos - 2
47-
else pos_before_prev_closing_bracket = prev_closing_bracket_pos - 1
48-
|
49-
this.char_set_delimiter(index - 2, pos_before_prev_closing_bracket) = true
50-
)
51-
then
52-
// brackets without anything in between is not valid character ranges, so
53-
// the first closing bracket in `[]]` and `[^]]` does not count,
54-
//
55-
// and we should _not_ mark the second opening bracket in `[][]` and `[^][]`
56-
// as starting a new char set. ^ ^
57-
exists(int pos_before_prev_closing_bracket |
58-
this.char_set_delimiter(index - 2, pos_before_prev_closing_bracket) = true
59-
|
60-
result = this.char_set_start(pos_before_prev_closing_bracket).booleanNot()
61-
)
62-
else
63-
// if not, `pos` does in fact mark a real start of a character range
64-
result = true
65-
)
66-
)
10+
/** Holds if a character set starts between `start` and `end`. */
11+
private predicate char_set_start0(int start, int end) {
12+
this.nonEscapedCharAt(start) = "[" and
13+
(
14+
this.getChar(start + 1) = "^" and end = start + 2
15+
or
16+
not this.getChar(start + 1) = "^" and end = start + 1
6717
)
6818
}
6919

20+
/** Holds if the character at `pos` marks the end of a character class. */
21+
private predicate char_set_end0(int pos) {
22+
this.nonEscapedCharAt(pos) = "]" and
23+
/* special case: `[]]` and `[^]]` are valid char classes. */
24+
not char_set_start0(_, pos - 1)
25+
}
26+
7027
/**
71-
* Helper predicate for chars that could be character-set delimiters.
72-
* Holds if the (non-escaped) char at `pos` in the string, is the (one-based) `index` occurrence of a bracket (`[` or `]`) in the string.
73-
* Result if `true` is the char is `[`, and `false` if the char is `]`.
28+
* Gets the nesting depth of character classes at position `pos`.
7429
*/
75-
boolean char_set_delimiter(int index, int pos) {
76-
pos = rank[index](int p | this.nonEscapedCharAt(p) = "[" or this.nonEscapedCharAt(p) = "]") and
77-
(
78-
this.nonEscapedCharAt(pos) = "[" and result = true
79-
or
80-
this.nonEscapedCharAt(pos) = "]" and result = false
81-
)
30+
private int char_set_depth(int pos) {
31+
exists(this.getChar(pos)) and
32+
result =
33+
count(int i | i < pos and this.char_set_start0(i, _)) -
34+
count(int i | i < pos and this.char_set_end0(i))
8235
}
8336

84-
/** Hold is a character set starts between `start` and `end`. */
37+
/** Holds if a top-level character set starts between `start` and `end`. */
8538
predicate char_set_start(int start, int end) {
86-
this.char_set_start(start) = true and
87-
(
88-
this.getChar(start + 1) = "^" and end = start + 2
89-
or
90-
not this.getChar(start + 1) = "^" and end = start + 1
91-
)
39+
this.char_set_start0(start, end) and
40+
this.char_set_depth(start) = 0
9241
}
9342

94-
/** Whether there is a character class, between start (inclusive) and end (exclusive) */
43+
/** Holds if a top-level character set ends at `pos`. */
44+
predicate char_set_end(int pos) {
45+
this.char_set_end0(pos) and
46+
this.char_set_depth(pos) = 1
47+
}
48+
49+
/**
50+
* Whether there is a top-level character class, between start (inclusive) and end (exclusive)
51+
*
52+
* For now, nested character classes are approximated by only considering the top-level class for parsing.
53+
* This leads to very similar results for ReDoS queries.
54+
*/
9555
predicate charSet(int start, int end) {
9656
exists(int inner_start, int inner_end |
9757
this.char_set_start(start, inner_start) and
9858
not this.char_set_start(_, start)
9959
|
10060
end = inner_end + 1 and
10161
inner_end > inner_start and
102-
this.nonEscapedCharAt(inner_end) = "]" and
103-
not exists(int mid | this.nonEscapedCharAt(mid) = "]" | mid > inner_start and mid < inner_end)
62+
this.char_set_end(inner_end) and
63+
not exists(int mid | char_set_end(mid) | mid > inner_start and mid < inner_end)
10464
)
10565
}
10666

@@ -118,6 +78,8 @@ abstract class RegexString extends Expr {
11878
this.escapedCharacter(start, end)
11979
or
12080
exists(this.nonEscapedCharAt(start)) and end = start + 1
81+
or
82+
this.quote(start, end)
12183
)
12284
or
12385
this.char_set_token(charset_start, _, start) and
@@ -126,7 +88,9 @@ abstract class RegexString extends Expr {
12688
or
12789
exists(this.nonEscapedCharAt(start)) and
12890
end = start + 1 and
129-
not this.getChar(start) = "]"
91+
not this.char_set_end(start)
92+
or
93+
this.quote(start, end)
13094
)
13195
}
13296

0 commit comments

Comments
 (0)