Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 11e465f

Browse files
Implement remaining syntax differences
1 parent 7530902 commit 11e465f

2 files changed

Lines changed: 70 additions & 35 deletions

File tree

java/ql/lib/semmle/code/java/regex/RegexTreeView.qll

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -472,14 +472,15 @@ class RegExpEscape extends RegExpNormalChar {
472472
this.getUnescaped() = "t" and result = "\t"
473473
or
474474
// TODO: Find a way to include a formfeed character
475+
// also the alert/bell character for \a and escape character for \e.
475476
// this.getUnescaped() = "f" and result = " "
476477
// or
477478
this.isUnicode() and
478479
result = this.getUnicode()
479480
}
480481

481482
/** Holds if this term's name is given by the part following the escape character. */
482-
predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f"] }
483+
predicate isIdentityEscape() { not this.getUnescaped() in ["n", "r", "t", "f", "a", "e"] }
483484

484485
override string getPrimaryQLClass() { result = "RegExpEscape" }
485486

@@ -494,7 +495,7 @@ class RegExpEscape extends RegExpNormalChar {
494495
/**
495496
* Holds if this is a unicode escape.
496497
*/
497-
private predicate isUnicode() { this.getText().prefix(2) = ["\\u", "\\U"] }
498+
private predicate isUnicode() { this.getText().prefix(2) = "\\u" }
498499

499500
/**
500501
* Gets the unicode char for this escape.
@@ -551,7 +552,10 @@ private int toHex(string hex) {
551552
* ```
552553
*/
553554
class RegExpCharacterClassEscape extends RegExpEscape {
554-
RegExpCharacterClassEscape() { this.getValue() in ["d", "D", "s", "S", "w", "W"] }
555+
RegExpCharacterClassEscape() {
556+
this.getValue() in ["d", "D", "s", "S", "w", "W", "h", "H", "v", "V"] or
557+
this.getValue().charAt(0) in ["p", "P"]
558+
}
555559

556560
override RegExpTerm getChild(int i) { none() }
557561

java/ql/lib/semmle/code/java/regex/regex.qll

Lines changed: 63 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,12 @@ abstract class RegexString extends Expr {
3030
private int char_set_depth(int pos) {
3131
exists(this.getChar(pos)) and
3232
result =
33-
count(int i | i < pos and this.char_set_start0(i, _)) -
34-
count(int i | i < pos and this.char_set_end0(i))
33+
max(int j |
34+
j = 0 or
35+
j =
36+
count(int i | i < pos and this.char_set_start0(i, _)) -
37+
count(int i | i < pos and this.char_set_end0(i))
38+
)
3539
}
3640

3741
/** Holds if a top-level character set starts between `start` and `end`. */
@@ -168,7 +172,12 @@ abstract class RegexString extends Expr {
168172
private boolean escaping(int pos) {
169173
pos = -1 and result = false
170174
or
171-
this.getChar(pos) = "\\" and result = this.escaping(pos - 1).booleanNot()
175+
this.getChar(pos) = "\\" and
176+
(
177+
if this.getChar(pos - 1) = "c" // in `\c\`, the latter `\` isn't escaping
178+
then result = this.escaping(pos - 2).booleanNot()
179+
else result = this.escaping(pos - 1).booleanNot()
180+
)
172181
or
173182
this.getChar(pos) != "\\" and result = false
174183
}
@@ -220,6 +229,16 @@ abstract class RegexString extends Expr {
220229
)
221230
}
222231

232+
/**
233+
* A control sequence, `\cx`
234+
* `x` may be any ASCII character, including special characters.
235+
*/
236+
predicate controlEscape(int start, int end) {
237+
this.escapingChar(start) and
238+
this.getChar(start + 1) = "c" and
239+
end = start + 3
240+
}
241+
223242
/** Gets the text of this regex */
224243
string getText() { result = this.(StringLiteral).getValue() }
225244

@@ -228,7 +247,8 @@ abstract class RegexString extends Expr {
228247
string nonEscapedCharAt(int i) {
229248
result = this.getText().charAt(i) and
230249
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) and
231-
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1])
250+
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1]) and
251+
not exists(int x, int y | this.controlEscape(x, y) and i in [x .. y - 1])
232252
}
233253

234254
private predicate isOptionDivider(int i) { this.nonEscapedCharAt(i) = "|" }
@@ -246,10 +266,10 @@ abstract class RegexString extends Expr {
246266
)
247267
}
248268

249-
/** Named unicode characters, eg \N{degree sign} */
250-
private predicate escapedName(int start, int end) {
269+
/** An escape sequence that includes braces, such as named characters (\N{degree sign}), named classes (\p{Lower}), or hex values (\x{h..h}) */
270+
private predicate escapedBraces(int start, int end) {
251271
this.escapingChar(start) and
252-
this.getChar(start + 1) = "N" and
272+
this.getChar(start + 1) = ["N", "p", "P", "x"] and
253273
this.getChar(start + 2) = "{" and
254274
this.getChar(end - 1) = "}" and
255275
end > start and
@@ -266,26 +286,38 @@ abstract class RegexString extends Expr {
266286
not this.numbered_backreference(start, _, _) and
267287
(
268288
// hex value \xhh
269-
this.getChar(start + 1) = "x" and end = start + 4
289+
this.getChar(start + 1) = "x" and
290+
this.getChar(start + 2) != "{" and
291+
end = start + 4
270292
or
271-
// octal value \o, \oo, or \ooo
272-
end in [start + 2 .. start + 4] and
293+
// octal value \0o, \0oo, or \0ooo. Max of 0377.
294+
this.getChar(start + 1) = "0" and
295+
end in [start + 3 .. start + 5] and
273296
forall(int i | i in [start + 1 .. end - 1] | this.isOctal(i)) and
297+
(end = start + 5 implies this.getChar(start + 2) <= "3") and
274298
not (
275-
end < start + 4 and
276-
this.isOctal(end)
299+
end < start + 5 and
300+
this.isOctal(end) and
301+
(end = start + 4 implies this.getChar(start + 2) <= "3")
277302
)
278303
or
279304
// 16-bit hex value \uhhhh
280305
this.getChar(start + 1) = "u" and end = start + 6
281306
or
282-
// 32-bit hex value \Uhhhhhhhh
283-
this.getChar(start + 1) = "U" and end = start + 10
307+
escapedBraces(start, end)
308+
or
309+
// Boundary matchers \b, \b{g}
310+
this.getChar(start + 1) = "b" and
311+
(
312+
if this.getText().substring(start + 2, start + 5) = "{g}"
313+
then end = start + 5
314+
else end = start + 2
315+
)
284316
or
285-
escapedName(start, end)
317+
this.controlEscape(start, end)
286318
or
287319
// escape not handled above, update when adding a new case
288-
not this.getChar(start + 1) in ["x", "u", "U", "N"] and
320+
not this.getChar(start + 1) in ["x", "0", "u", "p", "P", "N", "b", "c"] and
289321
not exists(this.getChar(start + 1).toInt()) and
290322
end = start + 2
291323
)
@@ -370,7 +402,7 @@ abstract class RegexString extends Expr {
370402
this.group(start, end) and
371403
exists(int name_end |
372404
this.named_group_start(start, name_end) and
373-
result = this.getText().substring(start + 4, name_end - 1)
405+
result = this.getText().substring(start + 3, name_end - 1)
374406
)
375407
}
376408

@@ -464,7 +496,7 @@ abstract class RegexString extends Expr {
464496
or
465497
this.negative_lookbehind_assertion_start(start, end)
466498
or
467-
this.comment_group_start(start, end)
499+
this.atomic_group_start(start, end)
468500
or
469501
this.simple_group_start(start, end)
470502
}
@@ -485,20 +517,19 @@ abstract class RegexString extends Expr {
485517
private predicate named_group_start(int start, int end) {
486518
this.isGroupStart(start) and
487519
this.getChar(start + 1) = "?" and
488-
this.getChar(start + 2) = "P" and
489-
this.getChar(start + 3) = "<" and
490-
not this.getChar(start + 4) = "=" and
491-
not this.getChar(start + 4) = "!" and
520+
this.getChar(start + 2) = "<" and
521+
not this.getChar(start + 3) = "=" and
522+
not this.getChar(start + 3) = "!" and
492523
exists(int name_end |
493-
name_end = min(int i | i > start + 4 and this.getChar(i) = ">") and
524+
name_end = min(int i | i > start + 3 and this.getChar(i) = ">") and
494525
end = name_end + 1
495526
)
496527
}
497528

498529
private predicate named_backreference_start(int start, int end) {
499530
this.isGroupStart(start) and
500531
this.getChar(start + 1) = "?" and
501-
this.getChar(start + 2) = "P" and
532+
this.getChar(start + 2) = "k" and
502533
this.getChar(start + 3) = "=" and
503534
// Should this be looking for unescaped ")"?
504535
// TODO: test this
@@ -510,7 +541,7 @@ abstract class RegexString extends Expr {
510541
this.getChar(start + 1) = "?" and
511542
end = start + 3 and
512543
c = this.getChar(start + 2) and
513-
c in ["i", "L", "m", "s", "u", "x"]
544+
c in ["i", "m", "s", "u", "x", "U"]
514545
}
515546

516547
/**
@@ -521,15 +552,15 @@ abstract class RegexString extends Expr {
521552
exists(string c | this.flag_group_start(_, _, c) |
522553
c = "i" and result = "IGNORECASE"
523554
or
524-
c = "L" and result = "LOCALE"
525-
or
526555
c = "m" and result = "MULTILINE"
527556
or
528557
c = "s" and result = "DOTALL"
529558
or
530559
c = "u" and result = "UNICODE"
531560
or
532561
c = "x" and result = "VERBOSE"
562+
or
563+
c = "U" and result = "UNICODECLASS"
533564
)
534565
}
535566

@@ -563,10 +594,10 @@ abstract class RegexString extends Expr {
563594
end = start + 4
564595
}
565596

566-
private predicate comment_group_start(int start, int end) {
597+
private predicate atomic_group_start(int start, int end) {
567598
this.isGroupStart(start) and
568599
this.getChar(start + 1) = "?" and
569-
this.getChar(start + 2) = "#" and
600+
this.getChar(start + 2) = ">" and
570601
end = start + 3
571602
}
572603

@@ -633,10 +664,10 @@ abstract class RegexString extends Expr {
633664

634665
private predicate qualifier(int start, int end, boolean maybe_empty, boolean may_repeat_forever) {
635666
this.short_qualifier(start, end, maybe_empty, may_repeat_forever) and
636-
not this.getChar(end) = "?"
667+
not this.getChar(end) = ["?", "+"]
637668
or
638669
exists(int short_end | this.short_qualifier(start, short_end, maybe_empty, may_repeat_forever) |
639-
if this.getChar(short_end) = "?" then end = short_end + 1 else end = short_end
670+
if this.getChar(short_end) = ["?", "+"] then end = short_end + 1 else end = short_end
640671
)
641672
}
642673

@@ -897,11 +928,11 @@ class Regex extends RegexString {
897928
* Gets a mode (if any) of this regular expression. Can be any of:
898929
* DEBUG
899930
* IGNORECASE
900-
* LOCALE
901931
* MULTILINE
902932
* DOTALL
903933
* UNICODE
904934
* VERBOSE
935+
* UNICODECLASS
905936
*/
906937
string getAMode() {
907938
result != "None" and

0 commit comments

Comments
 (0)