33 * that match URLs and hostname patterns.
44 */
55
6- private import codeql.ruby.security.performance.RegExpTreeView
7- private import codeql.ruby.DataFlow
6+ private import HostnameRegexpSpecific
87
98/**
109 * Holds if the given constant is unlikely to occur in the origin part of a URL.
@@ -20,7 +19,7 @@ predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
2019
2120/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
2221predicate isDotConstant ( RegExpTerm term ) {
23- term .( RegExpEscape ) .getValue ( ) = "."
22+ term .( RegExpCharEscape ) .getValue ( ) = "."
2423 or
2524 exists ( RegExpCharacterClass cls |
2625 term = cls and
@@ -108,3 +107,96 @@ predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
108107 alwaysMatchesHostnameAlt ( alt , i - 1 ) and
109108 alwaysMatchesHostname ( alt .getChild ( i ) )
110109}
110+
/**
 * Holds if some ancestor of `term` is an alternation, a quantifier, or a
 * sub-pattern (that is, `term` is part of a lookaround assertion).
 *
 * A term nested under an alternation or a quantifier cannot be expected to
 * correspond to a unique match, and lookaround assertions are rarely used
 * for capture groups.
 */
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
  // Direct parent is one of the "choice" or sub-pattern constructs.
  term.getParent() instanceof RegExpAlt
  or
  term.getParent() instanceof RegExpQuantifier
  or
  term.getParent() instanceof RegExpSubPattern
  or
  // Otherwise keep walking up the tree.
  isInsideChoiceOrSubPattern(term.getParent())
}
127+
/**
 * Holds if `group` is a capturing group that is not nested inside an
 * alternation, quantifier, or sub-pattern, and is therefore likely to be
 * used as a capture group.
 */
predicate isLikelyCaptureGroup(RegExpGroup group) {
  not isInsideChoiceOrSubPattern(group) and
  group.isCapture()
}
135+
/**
 * Holds if two adjacent children of `seq` are both dot-like, i.e. the
 * sequence contains `..` or the equivalent with escaped dots.
 *
 * In that case at least one of the dots is not meant to be a subdomain
 * separator, so such patterns are not flagged.
 */
predicate hasConsecutiveDots(RegExpSequence seq) {
  exists(int idx, RegExpTerm first, RegExpTerm second |
    first = seq.getChild(idx) and
    second = seq.getChild(idx + 1)
  |
    isDotLike(first) and
    isDotLike(second)
  )
}
148+
/**
 * Holds if `seq` is `regexp` itself or one of its descendants, `seq` has a
 * top-level-domain ending (as determined by `hasTopLevelDomainEnding`), and
 * an unescaped `.` occurs before that ending, so that the pattern may match
 * more than the intended hostname.
 *
 * `msg` is the tail of the alert message describing the problem: either an
 * unrestricted wildcard (the dot is the operand of a quantifier) or a plain
 * unescaped dot.
 */
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
  regexp.getAChild*() = seq and
  exists(RegExpDot wildcard, int tld, string host |
    hasTopLevelDomainEnding(seq, tld) and
    // The dot must occur somewhere before the TLD, but must not be the '.'
    // sitting immediately in front of it.
    seq.getChild([0 .. tld - 1]).getAChild*() = wildcard and
    seq.getChild(tld - 1) != wildcard and
    not hasConsecutiveDots(wildcard.getParent()) and
    not isConstantInvalidInsideOrigin(seq.getChild([0 .. tld - 1]).getAChild*()) and
    not isLikelyCaptureGroup(seq.getChild([tld .. seq.getNumChild() - 1]).getAChild*()) and
    // Raw text of the three children ending at the TLD, used in the message.
    host =
      seq.getChild(tld - 2).getRawValue() + seq.getChild(tld - 1).getRawValue() +
        seq.getChild(tld).getRawValue()
  |
    if wildcard.getParent() instanceof RegExpQuantifier
    then
      // A pattern such as `.*\.example.com` also matches `evil.com/?x=.example.com`.
      //
      // That is only a problem when the pattern is matched against a full URL
      // rather than a bare hostname/origin, so we require a suffix after the
      // TLD (as in `.*\.example.com/`). A pattern anchored at the end
      // (`.*\.example.com$`) will usually not match a full URL at all, and a
      // pattern with neither suffix nor anchor is left to MissingRegExpAnchor.
      seq.getChild([tld .. tld + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
      not seq.getAChild() instanceof RegExpDollar and
      seq.getChild(0) instanceof RegExpCaret and
      msg =
        "has an unrestricted wildcard '" + wildcard.getParent().(RegExpQuantifier).getRawValue() +
          "' which may cause '" + host +
          "' to be matched anywhere in the URL, outside the hostname."
    else
      msg =
        "has an unescaped '.' before '" + host + "', so it might match more hosts than expected."
  )
}
183+
/**
 * Holds if `hostSequence` belongs to the regular expression term of some
 * `RegExpPatternSource` and is flagged by `isIncompleteHostNameRegExpPattern`.
 *
 * `message` is the full alert text. `aux` is the pattern source itself, or,
 * when the source has a parse other than itself, one of its parses; in the
 * latter case the message mentions that a string is used as a regular
 * expression and contains a `$@` placeholder. `label` is always "here"
 * (presumably the link text for that placeholder).
 */
predicate incompleteHostnameRegExp(
  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
) {
  label = "here" and
  exists(RegExpPatternSource source, string description, string problem |
    isIncompleteHostNameRegExpPattern(source.getRegExpTerm(), hostSequence, problem) and
    message = "This " + description + " " + problem and
    (
      if source.getAParse() != source
      then (
        // The pattern originates from a string that is later used as a regexp.
        aux = source.getAParse() and
        description = "string, which is used as a regular expression $@,"
      ) else (
        aux = source and
        description = "regular expression"
      )
    )
  )
}
0 commit comments