@@ -130,36 +130,57 @@ abstract class RegexString extends Expr {
130130 /** result is true for those start chars that actually mark a start of a char set. */
131131 boolean char_set_start ( int pos ) {
132132 exists ( int index |
133- char_set_delimiter ( index , pos ) = true and
133+ // is opening bracket
134+ this .char_set_delimiter ( index , pos ) = true and
134135 (
135- index = 1 and result = true // if a '[' is first in the string (among brackets), it starts a char set
136+ // if this is the first bracket, `pos` starts a char set
137+ index = 1 and result = true
136138 or
139+ // if the previous char set delimiter was not a closing bracket, `pos` does
140+ // not start a char set. This is needed to handle cases such as `[[]` (a
141+ // char set that matches the `[` char)
137142 index > 1 and
138- not char_set_delimiter ( index - 1 , _) = false and
143+ not this . char_set_delimiter ( index - 1 , _) = false and
139144 result = false
140145 or
141- exists ( int p1 |
142- char_set_delimiter ( index - 1 , p1 ) = false and // if it is preceded by a closing bracket, it starts a char set
146+ // special handling of cases such as `[][]` (the character-set of the characters `]` and `[`).
147+ exists ( int prev_closing_bracket_pos |
148+ // previous bracket is a closing bracket
149+ this .char_set_delimiter ( index - 1 , prev_closing_bracket_pos ) = false and
143150 if
144- exists ( int p2 |
145- p1 = p2 + 1
146- or
147- this .getChar ( p2 + 1 ) = "^" and
148- p1 = p2 + 2
151+ // check if the character that comes before the previous closing bracket
152+ // is an opening bracket (taking `^` into account)
153+ exists ( int pos_before_prev_closing_bracket |
154+ if this .getChar ( prev_closing_bracket_pos - 1 ) = "^"
155+ then pos_before_prev_closing_bracket = prev_closing_bracket_pos - 2
156+ else pos_before_prev_closing_bracket = prev_closing_bracket_pos - 1
149157 |
150- char_set_delimiter ( index - 2 , p2 ) = true // but the closing bracket only closes...
158+ this . char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
151159 )
152160 then
153- exists ( int p2 | char_set_delimiter ( index - 2 , p2 ) = true |
154- result = char_set_start ( p2 ) .booleanNot ( ) // ...if it is not the first in a char set
161+ // brackets without anything in between is not valid character ranges, so
162+ // the first closing bracket in `[]]` and `[^]]` does not count,
163+ //
164+ // and we should _not_ mark the second opening bracket in `[][]` and `[^][]`
165+ // as starting a new char set. ^ ^
166+ exists ( int pos_before_prev_closing_bracket |
167+ this .char_set_delimiter ( index - 2 , pos_before_prev_closing_bracket ) = true
168+ |
169+ result = this .char_set_start ( pos_before_prev_closing_bracket ) .booleanNot ( )
155170 )
156- else result = true
171+ else
172+ // if not, `pos` does in fact mark a real start of a character range
173+ result = true
157174 )
158175 )
159176 )
160177 }
161178
162- /** result denotes if the index is a left bracket */
179+ /**
180+ * Helper predicate for chars that could be character-set delimiters.
181+ * Holds if the (non-escaped) char at `pos` in the string, is the (one-based) `index` occurrence of a bracket (`[` or `]`) in the string.
182+ * Result if `true` is the char is `[`, and `false` if the char is `]`.
183+ */
163184 boolean char_set_delimiter ( int index , int pos ) {
164185 pos = rank [ index ] ( int p | this .nonEscapedCharAt ( p ) = "[" or this .nonEscapedCharAt ( p ) = "]" ) and
165186 (
0 commit comments