@@ -30,8 +30,12 @@ abstract class RegexString extends Expr {
3030 private int char_set_depth ( int pos ) {
// Only defined where a character exists at `pos`. The depth is the number of
// `char_set_start0` positions before `pos` minus the number of `char_set_end0`
// positions before `pos`. The added `max` form takes the larger of that
// difference and 0, so the result is never negative.
3131 exists ( this .getChar ( pos ) ) and
3232 result =
33- count ( int i | i < pos and this .char_set_start0 ( i , _) ) -
34- count ( int i | i < pos and this .char_set_end0 ( i ) )
33+ max ( int j |
34+ j = 0 or
35+ j =
36+ count ( int i | i < pos and this .char_set_start0 ( i , _) ) -
37+ count ( int i | i < pos and this .char_set_end0 ( i ) )
38+ )
3539 }
3640
3741 /** Holds if a top-level character set starts between `start` and `end`. */
@@ -168,7 +172,12 @@ abstract class RegexString extends Expr {
168172 private boolean escaping ( int pos ) {
// Recursive definition over positions: `false` at the sentinel position -1 and
// at every non-backslash character. For a backslash, the result is the negation
// of the result at the preceding position, so backslashes in a run alternate.
169173 pos = - 1 and result = false
170174 or
171- this .getChar ( pos ) = "\\" and result = this .escaping ( pos - 1 ) .booleanNot ( )
// Added form: when the preceding character is `c`, recurse from `pos - 2`
// instead of `pos - 1`, i.e. look past the `c`.
175+ this .getChar ( pos ) = "\\" and
176+ (
177+ if this .getChar ( pos - 1 ) = "c" // in `\c\`, the latter `\` isn't escaping
178+ then result = this .escaping ( pos - 2 ) .booleanNot ( )
179+ else result = this .escaping ( pos - 1 ) .booleanNot ( )
180+ )
172181 or
173182 this .getChar ( pos ) != "\\" and result = false
174183 }
@@ -220,6 +229,16 @@ abstract class RegexString extends Expr {
220229 )
221230 }
222231
232+ /**
233+ * Holds if a control sequence, `\cx`, starts at `start` (the escaping `\`) and `end` is `start + 3`, the position just after `x`.
234+ * `x` may be any ASCII character, including special characters; the character at `start + 2` is not inspected.
235+ */
236+ predicate controlEscape ( int start , int end ) {
237+ this .escapingChar ( start ) and
238+ this .getChar ( start + 1 ) = "c" and
239+ end = start + 3
240+ }
241+
223242 /** Gets the text of this regex: the value of this expression viewed as a `StringLiteral`. */
224243 string getText ( ) { result = this .( StringLiteral ) .getValue ( ) }
225244
@@ -228,7 +247,8 @@ abstract class RegexString extends Expr {
228247 string nonEscapedCharAt ( int i ) {
// `result` is the character of `getText()` at index `i`, provided `i` lies in
// no half-open range `[x, y)` reported by `escapedCharacter` or `quote`. The
// added lines also exclude ranges reported by `controlEscape`.
229248 result = this .getText ( ) .charAt ( i ) and
230249 not exists ( int x , int y | this .escapedCharacter ( x , y ) and i in [ x .. y - 1 ] ) and
231- not exists ( int x , int y | this .quote ( x , y ) and i in [ x .. y - 1 ] )
250+ not exists ( int x , int y | this .quote ( x , y ) and i in [ x .. y - 1 ] ) and
251+ not exists ( int x , int y | this .controlEscape ( x , y ) and i in [ x .. y - 1 ] )
232252 }
233253
/** Holds if the non-escaped character at index `i` (see `nonEscapedCharAt`) is `|`. */
234254 private predicate isOptionDivider ( int i ) { this .nonEscapedCharAt ( i ) = "|" }
@@ -246,10 +266,10 @@ abstract class RegexString extends Expr {
246266 )
247267 }
248268
249- /** Named unicode characters, eg \N{degree sign} */
250- private predicate escapedName ( int start , int end ) {
269+ /** An escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}). */
270+ private predicate escapedBraces ( int start , int end ) {
251271 this .escapingChar ( start ) and
252- this .getChar ( start + 1 ) = "N" and
272+ this .getChar ( start + 1 ) = [ "N" , "p" , "P" , "x" ] and
253273 this .getChar ( start + 2 ) = "{" and
254274 this .getChar ( end - 1 ) = "}" and
255275 end > start and
@@ -266,26 +286,38 @@ abstract class RegexString extends Expr {
266286 not this .numbered_backreference ( start , _, _) and
267287 (
268288 // hex value \xhh
269- this .getChar ( start + 1 ) = "x" and end = start + 4
289+ this .getChar ( start + 1 ) = "x" and
290+ this .getChar ( start + 2 ) != "{" and
291+ end = start + 4
270292 or
271- // octal value \o, \oo, or \ooo
272- end in [ start + 2 .. start + 4 ] and
293+ // octal value \0o, \0oo, or \0ooo. Max of 0377.
294+ this .getChar ( start + 1 ) = "0" and
295+ end in [ start + 3 .. start + 5 ] and
273296 forall ( int i | i in [ start + 1 .. end - 1 ] | this .isOctal ( i ) ) and
297+ ( end = start + 5 implies this .getChar ( start + 2 ) <= "3" ) and
274298 not (
275- end < start + 4 and
276- this .isOctal ( end )
299+ end < start + 5 and
300+ this .isOctal ( end ) and
301+ ( end = start + 4 implies this .getChar ( start + 2 ) <= "3" )
277302 )
278303 or
279304 // 16-bit hex value \uhhhh
280305 this .getChar ( start + 1 ) = "u" and end = start + 6
281306 or
282- // 32-bit hex value \Uhhhhhhhh
283- this .getChar ( start + 1 ) = "U" and end = start + 10
307+ escapedBraces ( start , end )
308+ or
309+ // Boundary matchers \b, \b{g}
310+ this .getChar ( start + 1 ) = "b" and
311+ (
312+ if this .getText ( ) .substring ( start + 2 , start + 5 ) = "{g}"
313+ then end = start + 5
314+ else end = start + 2
315+ )
284316 or
285- escapedName ( start , end )
317+ this . controlEscape ( start , end )
286318 or
287319 // escape not handled above, update when adding a new case
288- not this .getChar ( start + 1 ) in [ "x" , "u" , "U " , "N " ] and
320+ not this .getChar ( start + 1 ) in [ "x" , "0" , " u", "p " , "P" , "N" , "b" , "c "] and
289321 not exists ( this .getChar ( start + 1 ) .toInt ( ) ) and
290322 end = start + 2
291323 )
@@ -370,7 +402,7 @@ abstract class RegexString extends Expr {
370402 this .group ( start , end ) and
371403 exists ( int name_end |
372404 this .named_group_start ( start , name_end ) and
373- result = this .getText ( ) .substring ( start + 4 , name_end - 1 )
405+ result = this .getText ( ) .substring ( start + 3 , name_end - 1 )
374406 )
375407 }
376408
@@ -464,7 +496,7 @@ abstract class RegexString extends Expr {
464496 or
465497 this .negative_lookbehind_assertion_start ( start , end )
466498 or
467- this .comment_group_start ( start , end )
499+ this .atomic_group_start ( start , end )
468500 or
469501 this .simple_group_start ( start , end )
470502 }
@@ -485,20 +517,19 @@ abstract class RegexString extends Expr {
485517 private predicate named_group_start ( int start , int end ) {
// Matches a group start at `start` followed by `?<` (added lines) where the
// removed lines required `?P<`. The character after `<` must not be `=` or `!`
// (presumably to exclude the lookbehind forms -- confirm against the
// lookbehind predicates). `end` is one past the first `>` found after the
// position that follows `<`.
486518 this .isGroupStart ( start ) and
487519 this .getChar ( start + 1 ) = "?" and
488- this .getChar ( start + 2 ) = "P" and
489- this .getChar ( start + 3 ) = "<" and
490- not this .getChar ( start + 4 ) = "=" and
491- not this .getChar ( start + 4 ) = "!" and
520+ this .getChar ( start + 2 ) = "<" and
521+ not this .getChar ( start + 3 ) = "=" and
522+ not this .getChar ( start + 3 ) = "!" and
492523 exists ( int name_end |
493- name_end = min ( int i | i > start + 4 and this .getChar ( i ) = ">" ) and
524+ name_end = min ( int i | i > start + 3 and this .getChar ( i ) = ">" ) and
494525 end = name_end + 1
495526 )
496527 }
497528
498529 private predicate named_backreference_start ( int start , int end ) {
499530 this .isGroupStart ( start ) and
500531 this .getChar ( start + 1 ) = "?" and
501- this .getChar ( start + 2 ) = "P " and
532+ this .getChar ( start + 2 ) = "k " and
502533 this .getChar ( start + 3 ) = "=" and
503534 // Should this be looking for unescaped ")"?
504535 // TODO: test this
@@ -510,7 +541,7 @@ abstract class RegexString extends Expr {
510541 this .getChar ( start + 1 ) = "?" and
511542 end = start + 3 and
512543 c = this .getChar ( start + 2 ) and
513- c in [ "i" , "L" , " m", "s" , "u" , "x" ]
544+ c in [ "i" , "m" , "s" , "u" , "x" , "U "]
514545 }
515546
516547 /**
@@ -521,15 +552,15 @@ abstract class RegexString extends Expr {
521552 exists ( string c | this .flag_group_start ( _, _, c ) |
522553 c = "i" and result = "IGNORECASE"
523554 or
524- c = "L" and result = "LOCALE"
525- or
526555 c = "m" and result = "MULTILINE"
527556 or
528557 c = "s" and result = "DOTALL"
529558 or
530559 c = "u" and result = "UNICODE"
531560 or
532561 c = "x" and result = "VERBOSE"
562+ or
563+ c = "U" and result = "UNICODECLASS"
533564 )
534565 }
535566
@@ -563,10 +594,10 @@ abstract class RegexString extends Expr {
563594 end = start + 4
564595 }
565596
/**
 * Holds if an atomic-group opener starts at `start` and `end` is `start + 3`:
 * a group start (per `isGroupStart`, presumably `(`) followed by `?` and `>`.
 */
566- private predicate comment_group_start ( int start , int end ) {
597+ private predicate atomic_group_start ( int start , int end ) {
567598 this .isGroupStart ( start ) and
568599 this .getChar ( start + 1 ) = "?" and
569- this .getChar ( start + 2 ) = "# " and
// Compare against the single character `>`: `getChar` is compared with
// one-character strings elsewhere, so a literal with a trailing space can never match.
600+ this .getChar ( start + 2 ) = ">" and
570601 end = start + 3
571602 }
572603
@@ -633,10 +664,10 @@ abstract class RegexString extends Expr {
633664
/**
 * Holds if a qualifier spans `start` to `end`: a `short_qualifier` starting at
 * `start`, optionally followed by one suffix character that is then included
 * in `end`. The removed lines accept only `?` as the suffix; the added lines
 * accept `?` or `+`. `maybe_empty` and `may_repeat_forever` are passed through
 * unchanged from `short_qualifier`.
 */
634665 private predicate qualifier ( int start , int end , boolean maybe_empty , boolean may_repeat_forever ) {
635666 this .short_qualifier ( start , end , maybe_empty , may_repeat_forever ) and
636- not this .getChar ( end ) = "?"
667+ not this .getChar ( end ) = [ "?" , "+" ]
637668 or
638669 exists ( int short_end | this .short_qualifier ( start , short_end , maybe_empty , may_repeat_forever ) |
639- if this .getChar ( short_end ) = "?" then end = short_end + 1 else end = short_end
670+ if this .getChar ( short_end ) = [ "?" , "+" ] then end = short_end + 1 else end = short_end
640671 )
641672 }
642673
@@ -897,11 +928,11 @@ class Regex extends RegexString {
897928 * Gets a mode (if any) of this regular expression. Can be any of:
898929 * DEBUG
899930 * IGNORECASE
900- * LOCALE
901931 * MULTILINE
902932 * DOTALL
903933 * UNICODE
904934 * VERBOSE
935+ * UNICODECLASS
905936 */
906937 string getAMode ( ) {
907938 result != "None" and
0 commit comments