Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 9f4da65

Browse files
Improve calculation of locations of regex terms
1 parent dd200e2 commit 9f4da65

4 files changed

Lines changed: 123 additions & 56 deletions

File tree

java/ql/lib/semmle/code/java/regex/RegexTreeView.qll

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,18 @@ class RegExpTerm extends RegExpParent {
188188
predicate hasLocationInfo(
189189
string filepath, int startline, int startcolumn, int endline, int endcolumn
190190
) {
191-
// This currently gives incorrect results for string literals including backslashes. TODO: fix that.
192-
// There are also more complex cases where it fails. Handling all of them would be difficult for not much gain.
193-
exists(int re_start, int re_end |
191+
/*
192+
* This is an approximation that handles the simple and common case of a single,
193+
* normal string literal written in the source, but does not give correct results in more complex cases
194+
* such as compile-time concatenation, or multi-line string literals.
195+
*/
196+
197+
exists(int re_start, int re_end, int src_start, int src_end |
194198
re.getLocation().hasLocationInfo(filepath, startline, re_start, endline, re_end) and
195-
startcolumn = re_start + start + 1 and
196-
endcolumn = re_start + end
199+
re.sourceCharacter(start, src_start, _) and
200+
re.sourceCharacter(end - 1, _, src_end) and
201+
startcolumn = re_start + src_start and
202+
endcolumn = re_start + src_end - 1
197203
)
198204
}
199205

java/ql/lib/semmle/code/java/regex/regex.qll

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ abstract class RegexString extends StringLiteral {
2727
* In order to avoid negative recursion, we return a boolean.
2828
* This way, we can refer to `escaping(pos - 1).booleanNot()`
2929
* rather than to a negated version of `escaping(pos)`.
30-
* Does not take into account escape characters inside quote sequences.
3130
*/
3231
private boolean escaping(int pos) {
3332
pos = -1 and result = false
@@ -104,11 +103,10 @@ abstract class RegexString extends StringLiteral {
104103
end = start + 3
105104
}
106105

107-
string nonEscapedCharAt(int i) {
108-
result = this.getText().charAt(i) and
106+
private string nonEscapedCharAt(int i) {
107+
result = this.getChar(i) and
109108
not exists(int x, int y | this.escapedCharacter(x, y) and i in [x .. y - 1]) and
110-
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1]) and
111-
not exists(int x, int y | this.controlEscape(x, y) and i in [x .. y - 1])
109+
not exists(int x, int y | this.quote(x, y) and i in [x .. y - 1])
112110
}
113111

114112
/** Holds if a character set starts between `start` and `end`, including any negation character (`^`). */
@@ -822,6 +820,66 @@ abstract class RegexString extends StringLiteral {
822820
this.alternation(start, end) and
823821
this.subalternation(start, part_end, part_start)
824822
}
823+
824+
/**
825+
* Gets the `i`th character of this literal as it was written in the source code.
826+
*/
827+
string getSourceChar(int i) { result = this.(StringLiteral).getLiteral().charAt(i) }
828+
829+
/**
830+
* Helper predicate for `sourceEscapingChar` that
831+
* results in a boolean in order to avoid negative recursion.
832+
*/
833+
private boolean sourceEscaping(int pos) {
834+
pos = -1 and result = false
835+
or
836+
this.getSourceChar(pos) = "\\" and
837+
result = this.sourceEscaping(pos - 1).booleanNot()
838+
or
839+
this.getSourceChar(pos) != "\\" and result = false
840+
}
841+
842+
/**
843+
* Equivalent of `escapingChar` for the literal source rather than the string value.
844+
* Holds if the character at position `pos` in the source literal is a '\' that is
845+
* actually escaping what comes after it.
846+
*/
847+
private predicate sourceEcapingChar(int pos) { this.sourceEscaping(pos) = true }
848+
849+
/**
850+
* Holds if an escaped character exists between `start` and `end` in the source literal.
851+
*/
852+
private predicate sourceEscapedCharacter(int start, int end) {
853+
this.sourceEcapingChar(start) and
854+
(if this.getSourceChar(start + 1) = "u" then end = start + 6 else end = start + 2)
855+
}
856+
857+
private predicate sourceNonEscapedCharacter(int i) {
858+
exists(this.getSourceChar(i)) and
859+
not exists(int x, int y | this.sourceEscapedCharacter(x, y) and i in [x .. y - 1])
860+
}
861+
862+
/**
863+
* Holds if a character is represented between `start` and `end` in the source literal.
864+
*/
865+
private predicate sourceCharacter(int start, int end) {
866+
sourceEscapedCharacter(start, end)
867+
or
868+
sourceNonEscapedCharacter(start) and
869+
end = start + 1
870+
}
871+
872+
/**
873+
* Holds if the `i`th character of the string is represented between offsets
874+
* `start` (inclusive) and `end` (exclusive) in the source code of this literal.
875+
* This only gives correct results if the literal is written as a normal single-line string literal,
876+
* without compile-time concatenation involved.
877+
*/
878+
predicate sourceCharacter(int pos, int start, int end) {
879+
exists(this.getChar(pos)) and
880+
sourceCharacter(start, end) and
881+
start = rank[pos + 2](int s | sourceCharacter(s, _))
882+
}
825883
}
826884

827885
/** A string literal used as a regular expression */

java/ql/test/library-tests/regex/RegexParseTests.expected

Lines changed: 46 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
parseFailures
22
#select
3-
| Test.java:5:10:5:16 | [A-Z\\d] | [RegExpCharacterClass] |
4-
| Test.java:5:10:5:18 | [A-Z\\d]++ | [RegExpPlus] |
3+
| Test.java:5:10:5:17 | [A-Z\\d] | [RegExpCharacterClass] |
4+
| Test.java:5:10:5:19 | [A-Z\\d]++ | [RegExpPlus] |
55
| Test.java:5:11:5:11 | A | [RegExpConstant,RegExpNormalChar] |
66
| Test.java:5:11:5:13 | A-Z | [RegExpCharacterRange] |
77
| Test.java:5:13:5:13 | Z | [RegExpConstant,RegExpNormalChar] |
8-
| Test.java:5:14:5:15 | \\d | [RegExpCharacterClassEscape] |
9-
| Test.java:6:10:6:39 | \\Q hello world [ *** \\Q ) ( \\E | [RegExpConstant,RegExpQuote] |
10-
| Test.java:7:10:7:21 | [\\Q hi ] \\E] | [RegExpCharacterClass] |
11-
| Test.java:7:11:7:20 | \\Q hi ] \\E | [RegExpConstant,RegExpQuote] |
8+
| Test.java:5:14:5:16 | \\d | [RegExpCharacterClassEscape] |
9+
| Test.java:6:10:6:42 | \\Q hello world [ *** \\Q ) ( \\E | [RegExpConstant,RegExpQuote] |
10+
| Test.java:7:10:7:23 | [\\Q hi ] \\E] | [RegExpCharacterClass] |
11+
| Test.java:7:11:7:22 | \\Q hi ] \\E | [RegExpConstant,RegExpQuote] |
1212
| Test.java:8:10:8:12 | []] | [RegExpCharacterClass] |
1313
| Test.java:8:11:8:11 | ] | [RegExpConstant,RegExpNormalChar] |
1414
| Test.java:9:10:9:13 | [^]] | [RegExpCharacterClass] |
@@ -23,33 +23,33 @@ parseFailures
2323
| Test.java:10:17:10:17 | f | [RegExpConstant,RegExpNormalChar] |
2424
| Test.java:10:18:10:18 | g | [RegExpConstant,RegExpNormalChar] |
2525
| Test.java:10:19:10:19 | ] | [RegExpConstant,RegExpNormalChar] |
26-
| Test.java:11:10:11:53 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]] | [RegExpCharacterClass] |
27-
| Test.java:11:10:11:62 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8 | [RegExpSequence] |
26+
| Test.java:11:10:11:57 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]] | [RegExpCharacterClass] |
27+
| Test.java:11:10:11:68 | [abc&&[\\W\\p{Lower}\\P{Space}\\N{degree sign}]]\\b7\\b{g}8 | [RegExpSequence] |
2828
| Test.java:11:11:11:11 | a | [RegExpConstant,RegExpNormalChar] |
2929
| Test.java:11:12:11:12 | b | [RegExpConstant,RegExpNormalChar] |
3030
| Test.java:11:13:11:13 | c | [RegExpConstant,RegExpNormalChar] |
3131
| Test.java:11:14:11:14 | & | [RegExpConstant,RegExpNormalChar] |
3232
| Test.java:11:15:11:15 | & | [RegExpConstant,RegExpNormalChar] |
3333
| Test.java:11:16:11:16 | [ | [RegExpConstant,RegExpNormalChar] |
34-
| Test.java:11:17:11:18 | \\W | [RegExpCharacterClassEscape] |
35-
| Test.java:11:19:11:27 | \\p{Lower} | [RegExpCharacterClassEscape] |
36-
| Test.java:11:28:11:36 | \\P{Space} | [RegExpCharacterClassEscape] |
37-
| Test.java:11:37:11:51 | \\N{degree sign} | [RegExpConstant,RegExpEscape] |
38-
| Test.java:11:52:11:52 | ] | [RegExpConstant,RegExpNormalChar] |
39-
| Test.java:11:54:11:55 | \\b | [RegExpConstant,RegExpEscape] |
40-
| Test.java:11:56:11:56 | 7 | [RegExpConstant,RegExpNormalChar] |
41-
| Test.java:11:57:11:61 | \\b{g} | [RegExpConstant,RegExpEscape] |
42-
| Test.java:11:62:11:62 | 8 | [RegExpConstant,RegExpNormalChar] |
43-
| Test.java:12:10:12:12 | \\cA | [RegExpConstant,RegExpEscape] |
44-
| Test.java:13:10:13:12 | \\c( | [RegExpConstant,RegExpEscape] |
45-
| Test.java:14:10:14:12 | \\c\\ | [RegExpConstant,RegExpEscape] |
46-
| Test.java:14:10:14:16 | \\c\\(ab) | [RegExpSequence] |
47-
| Test.java:14:13:14:16 | (ab) | [RegExpGroup] |
48-
| Test.java:14:14:14:14 | a | [RegExpConstant,RegExpNormalChar] |
49-
| Test.java:14:14:14:15 | ab | [RegExpSequence] |
50-
| Test.java:14:15:14:15 | b | [RegExpConstant,RegExpNormalChar] |
34+
| Test.java:11:17:11:19 | \\W | [RegExpCharacterClassEscape] |
35+
| Test.java:11:20:11:29 | \\p{Lower} | [RegExpCharacterClassEscape] |
36+
| Test.java:11:30:11:39 | \\P{Space} | [RegExpCharacterClassEscape] |
37+
| Test.java:11:40:11:55 | \\N{degree sign} | [RegExpConstant,RegExpEscape] |
38+
| Test.java:11:56:11:56 | ] | [RegExpConstant,RegExpNormalChar] |
39+
| Test.java:11:58:11:60 | \\b | [RegExpConstant,RegExpEscape] |
40+
| Test.java:11:61:11:61 | 7 | [RegExpConstant,RegExpNormalChar] |
41+
| Test.java:11:62:11:67 | \\b{g} | [RegExpConstant,RegExpEscape] |
42+
| Test.java:11:68:11:68 | 8 | [RegExpConstant,RegExpNormalChar] |
43+
| Test.java:12:10:12:13 | \\cA | [RegExpConstant,RegExpEscape] |
44+
| Test.java:13:10:13:13 | \\c( | [RegExpConstant,RegExpEscape] |
45+
| Test.java:14:10:14:14 | \\c\\ | [RegExpConstant,RegExpEscape] |
46+
| Test.java:14:10:14:18 | \\c\\(ab) | [RegExpSequence] |
47+
| Test.java:14:15:14:18 | (ab) | [RegExpGroup] |
48+
| Test.java:14:16:14:16 | a | [RegExpConstant,RegExpNormalChar] |
49+
| Test.java:14:16:14:17 | ab | [RegExpSequence] |
50+
| Test.java:14:17:14:17 | b | [RegExpConstant,RegExpNormalChar] |
5151
| Test.java:15:10:15:15 | (?>hi) | [RegExpGroup] |
52-
| Test.java:15:10:15:44 | (?>hi)(?<name>hell*?o*+)123\\k<name> | [RegExpSequence] |
52+
| Test.java:15:10:15:45 | (?>hi)(?<name>hell*?o*+)123\\k<name> | [RegExpSequence] |
5353
| Test.java:15:13:15:13 | h | [RegExpConstant,RegExpNormalChar] |
5454
| Test.java:15:13:15:14 | hi | [RegExpSequence] |
5555
| Test.java:15:14:15:14 | i | [RegExpConstant,RegExpNormalChar] |
@@ -65,7 +65,7 @@ parseFailures
6565
| Test.java:15:34:15:34 | 1 | [RegExpConstant,RegExpNormalChar] |
6666
| Test.java:15:35:15:35 | 2 | [RegExpConstant,RegExpNormalChar] |
6767
| Test.java:15:36:15:36 | 3 | [RegExpConstant,RegExpNormalChar] |
68-
| Test.java:15:37:15:44 | \\k<name> | [RegExpBackRef] |
68+
| Test.java:15:37:15:45 | \\k<name> | [RegExpBackRef] |
6969
| Test.java:16:10:16:10 | a | [RegExpConstant,RegExpNormalChar] |
7070
| Test.java:16:10:16:11 | a+ | [RegExpPlus] |
7171
| Test.java:16:10:16:108 | a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+ | [RegExpSequence] |
@@ -120,20 +120,22 @@ parseFailures
120120
| Test.java:17:30:17:35 | (?<!d) | [RegExpNegativeLookbehind] |
121121
| Test.java:17:34:17:34 | d | [RegExpConstant,RegExpNormalChar] |
122122
| Test.java:18:10:18:10 | a | [RegExpConstant,RegExpNormalChar] |
123-
| Test.java:18:10:18:22 | a\|b\|c(d\|e)f\|g | [RegExpAlt] |
124-
| Test.java:18:12:18:12 | b | [RegExpConstant,RegExpNormalChar] |
125-
| Test.java:18:14:18:14 | c | [RegExpConstant,RegExpNormalChar] |
126-
| Test.java:18:14:18:20 | c(d\|e)f | [RegExpSequence] |
127-
| Test.java:18:15:18:19 | (d\|e) | [RegExpGroup] |
128-
| Test.java:18:16:18:16 | d | [RegExpConstant,RegExpNormalChar] |
129-
| Test.java:18:16:18:18 | d\|e | [RegExpAlt] |
130-
| Test.java:18:18:18:18 | e | [RegExpConstant,RegExpNormalChar] |
131-
| Test.java:18:20:18:20 | f | [RegExpConstant,RegExpNormalChar] |
132-
| Test.java:18:22:18:22 | g | [RegExpConstant,RegExpNormalChar] |
133-
| Test.java:19:10:19:12 | \\01 | [RegExpConstant,RegExpEscape] |
134-
| Test.java:19:10:19:27 | \\018\\033\\0377\\0777 | [RegExpSequence] |
135-
| Test.java:19:13:19:13 | 8 | [RegExpConstant,RegExpNormalChar] |
136-
| Test.java:19:14:19:17 | \\033 | [RegExpConstant,RegExpEscape] |
137-
| Test.java:19:18:19:22 | \\0377 | [RegExpConstant,RegExpEscape] |
138-
| Test.java:19:23:19:26 | \\077 | [RegExpConstant,RegExpEscape] |
139-
| Test.java:19:27:19:27 | 7 | [RegExpConstant,RegExpNormalChar] |
123+
| Test.java:18:10:18:24 | a\|\|b\|c(d\|e\|)f\|g | [RegExpAlt] |
124+
| Test.java:18:12:18:24 | \|b\|c(d\|e\|)f\|g | [RegExpAlt] |
125+
| Test.java:18:13:18:13 | b | [RegExpConstant,RegExpNormalChar] |
126+
| Test.java:18:15:18:15 | c | [RegExpConstant,RegExpNormalChar] |
127+
| Test.java:18:15:18:22 | c(d\|e\|)f | [RegExpSequence] |
128+
| Test.java:18:16:18:21 | (d\|e\|) | [RegExpGroup] |
129+
| Test.java:18:17:18:17 | d | [RegExpConstant,RegExpNormalChar] |
130+
| Test.java:18:17:18:20 | d\|e\| | [RegExpAlt] |
131+
| Test.java:18:19:18:19 | e | [RegExpConstant,RegExpNormalChar] |
132+
| Test.java:18:22:18:22 | f | [RegExpConstant,RegExpNormalChar] |
133+
| Test.java:18:24:18:24 | g | [RegExpConstant,RegExpNormalChar] |
134+
| Test.java:19:10:19:13 | \\01 | [RegExpConstant,RegExpEscape] |
135+
| Test.java:19:10:19:37 | \\018\\033\\0377\\0777\u1337 | [RegExpSequence] |
136+
| Test.java:19:14:19:14 | 8 | [RegExpConstant,RegExpNormalChar] |
137+
| Test.java:19:15:19:19 | \\033 | [RegExpConstant,RegExpEscape] |
138+
| Test.java:19:20:19:25 | \\0377 | [RegExpConstant,RegExpEscape] |
139+
| Test.java:19:26:19:30 | \\077 | [RegExpConstant,RegExpEscape] |
140+
| Test.java:19:31:19:31 | 7 | [RegExpConstant,RegExpNormalChar] |
141+
| Test.java:19:32:19:37 | \u1337 | [RegExpConstant,RegExpNormalChar] |

java/ql/test/library-tests/regex/Test.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ class Test {
1515
"(?>hi)(?<name>hell*?o*+)123\\k<name>",
1616
"a+b*c?d{2}e{3,4}f{,5}g{6,}h+?i*?j??k{7}?l{8,9}?m{,10}?n{11,}?o++p*+q?+r{12}+s{13,14}+t{,15}+u{16,}+",
1717
"(?i)(?=a)(?!b)(?<=c)(?<!d)",
18-
"a|b|c(d|e)f|g",
19-
"\\018\\033\\0377\\0777"
18+
"a||b|c(d|e|)f|g",
19+
"\\018\\033\\0377\\0777\u1337",
20+
2021
};
2122

2223
void test() {

0 commit comments

Comments
 (0)