@@ -3,104 +3,64 @@ private import RegexFlowConfigs
33
44/**
55 * A string literal that is used as a regular expression.
6- * TODO: adjust parser for java regex syntax
76 */
87abstract class RegexString extends Expr {
98 RegexString ( ) { this instanceof StringLiteral }
109
11- /**
12- * Helper predicate for `char_set_start(int start, int end)`.
13- *
14- * In order to identify left brackets ('[') which actually start a character class,
15- * we perform a left to right scan of the string.
16- *
17- * To avoid negative recursion we return a boolean. See `escaping`,
18- * the helper for `escapingChar`, for a clean use of this pattern.
19- *
20- * result is true for those start chars that actually mark a start of a char set.
21- */
22- boolean char_set_start ( int pos ) {
23- exists ( int index |
24- // is opening bracket
25- this .char_set_delimiter ( index , pos ) = true and
26- (
27- // if this is the first bracket, `pos` starts a char set
28- index = 1 and result = true
29- or
30- // if the previous char set delimiter was not a closing bracket, `pos` does
31- // not start a char set. This is needed to handle cases such as `[[]` (a
32- // char set that matches the `[` char)
33- index > 1 and
34- not this .char_set_delimiter ( index - 1 , _) = false and
35- result = false
36- or
37- // special handling of cases such as `[][]` (the character-set of the characters `]` and `[`).
38- exists ( int prev_closing_bracket_pos |
39- // previous bracket is a closing bracket
40- this .char_set_delimiter ( index - 1 , prev_closing_bracket_pos ) = false and
41- if
42- // check if the character that comes before the previous closing bracket
43- // is an opening bracket (taking `^` into account)
44- exists ( int pos_before_prev_closing_bracket |
45- if this .getChar ( prev_closing_bracket_pos - 1 ) = "^"
46- then pos_before_prev_closing_bracket = prev_closing_bracket_pos - 2
47- else pos_before_prev_closing_bracket = prev_closing_bracket_pos - 1
48- |
49- this .char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
50- )
51- then
52- // brackets without anything in between is not valid character ranges, so
53- // the first closing bracket in `[]]` and `[^]]` does not count,
54- //
55- // and we should _not_ mark the second opening bracket in `[][]` and `[^][]`
56- // as starting a new char set. ^ ^
57- exists ( int pos_before_prev_closing_bracket |
58- this .char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
59- |
60- result = this .char_set_start ( pos_before_prev_closing_bracket ) .booleanNot ( )
61- )
62- else
63- // if not, `pos` does in fact mark a real start of a character range
64- result = true
65- )
66- )
10+ /** Holds if a character set starts between `start` and `end`: a non-escaped `[` is at `start`, and `end` is the position just past the opener (including a `^` that directly follows the bracket). */
11+ private predicate char_set_start0 ( int start , int end ) {
12+ this .nonEscapedCharAt ( start ) = "[" and
13+ (
14+ this .getChar ( start + 1 ) = "^" and end = start + 2
15+ or
16+ not this .getChar ( start + 1 ) = "^" and end = start + 1
6717 )
6818 }
6919
20+ /** Holds if the non-escaped `]` at `pos` marks the end of a character class. */
21+ private predicate char_set_end0 ( int pos ) {
22+ this .nonEscapedCharAt ( pos ) = "]" and
23+ /* special case: a `]` directly after the opener (as in `[]]` and `[^]]`) is a literal member; `char_set_start0` reports the first position after the opener, so compare against `pos` itself. */
24+ not this .char_set_start0 ( _, pos )
25+ }
26+
7027 /**
71- * Helper predicate for chars that could be character-set delimiters.
72- * Holds if the (non-escaped) char at `pos` in the string, is the (one-based) `index` occurrence of a bracket (`[` or `]`) in the string.
73- * Result if `true` is the char is `[`, and `false` if the char is `]`.
28+ * Gets the nesting depth of character classes at position `pos`.
7429 */
75- boolean char_set_delimiter ( int index , int pos ) {
76- pos = rank [ index ] ( int p | this .nonEscapedCharAt ( p ) = "[" or this .nonEscapedCharAt ( p ) = "]" ) and
77- (
78- this .nonEscapedCharAt ( pos ) = "[" and result = true
79- or
80- this .nonEscapedCharAt ( pos ) = "]" and result = false
81- )
30+ private int char_set_depth ( int pos ) {
31+ exists ( this .getChar ( pos ) ) and
32+ result =
33+ count ( int i | i < pos and this .char_set_start0 ( i , _) ) -
34+ count ( int i | i < pos and this .char_set_end0 ( i ) )
8235 }
8336
84- /** Hold is a character set starts between `start` and `end`. */
37+ /** Holds if a top-level character set starts between `start` and `end`. */
8538 predicate char_set_start ( int start , int end ) {
86- this .char_set_start ( start ) = true and
87- (
88- this .getChar ( start + 1 ) = "^" and end = start + 2
89- or
90- not this .getChar ( start + 1 ) = "^" and end = start + 1
91- )
39+ this .char_set_start0 ( start , end ) and
40+ this .char_set_depth ( start ) = 0
9241 }
9342
94- /** Whether there is a character class, between start (inclusive) and end (exclusive) */
43+ /** Holds if a top-level character set ends at `pos`, i.e. the class-closing `]` at `pos` sits at nesting depth 1. */
44+ predicate char_set_end ( int pos ) {
45+ this .char_set_end0 ( pos ) and
46+ this .char_set_depth ( pos ) = 1
47+ }
48+
49+ /**
50+ * Whether there is a top-level character class, between start (inclusive) and end (exclusive)
51+ *
52+ * For now, nested character classes are approximated by only considering the top-level class for parsing.
53+ * This leads to very similar results for ReDoS queries.
54+ */
9555 predicate charSet ( int start , int end ) {
9656 exists ( int inner_start , int inner_end |
9757 this .char_set_start ( start , inner_start ) and
9858 not this .char_set_start ( _, start )
9959 |
10060 end = inner_end + 1 and
10161 inner_end > inner_start and
102- this .nonEscapedCharAt ( inner_end ) = "]" and
103- not exists ( int mid | this . nonEscapedCharAt ( mid ) = "]" | mid > inner_start and mid < inner_end )
62+ this .char_set_end ( inner_end ) and
63+ not exists ( int mid | char_set_end ( mid ) | mid > inner_start and mid < inner_end )
10464 )
10565 }
10666
@@ -118,6 +78,8 @@ abstract class RegexString extends Expr {
11878 this .escapedCharacter ( start , end )
11979 or
12080 exists ( this .nonEscapedCharAt ( start ) ) and end = start + 1
81+ or
82+ this .quote ( start , end )
12183 )
12284 or
12385 this .char_set_token ( charset_start , _, start ) and
@@ -126,7 +88,9 @@ abstract class RegexString extends Expr {
12688 or
12789 exists ( this .nonEscapedCharAt ( start ) ) and
12890 end = start + 1 and
129- not this .getChar ( start ) = "]"
91+ not this .char_set_end ( start )
92+ or
93+ this .quote ( start , end )
13094 )
13195 }
13296
0 commit comments