@@ -1497,6 +1497,158 @@ private module StdlibPrivate {
14971497 }
14981498}
14991499
1500+ // ---------------------------------------------------------------------------
1501+ // re
1502+ // ---------------------------------------------------------------------------
/**
 * The name of a function in the `re` module that executes a regular expression
 * immediately when called.
 *
 * See https://docs.python.org/3/library/re.html#module-contents
 */
private class RegexExecutionMethod extends string {
  RegexExecutionMethod() {
    // The characteristic predicate restricts this string type to exactly these names.
    this = ["match", "fullmatch", "search", "split", "findall", "finditer", "sub", "subn"]
  }
}
1513+
/**
 * Gets the positional index of the argument holding the string to be searched,
 * for a call to the `re` module function named `method`.
 *
 * The result is 2 for `sub` and `subn`, and 1 for every other execution method.
 * Index 0 is the pattern in all cases (see the callers of this predicate).
 */
int stringArg(RegexExecutionMethod method) {
  method = ["sub", "subn"] and
  result = 2
  or
  method = ["match", "fullmatch", "search", "split", "findall", "finditer"] and
  result = 1
}
1522+
/**
 * A call to a function from the `re` module that immediately executes a regular
 * expression, for example `re.match(pattern, string)`.
 *
 * See `RegexExecutionMethod` for the set of functions covered.
 */
private class DirectRegex extends DataFlow::CallCfgNode, RegexExecution::Range {
  /** The name of the `re` function being called. */
  RegexExecutionMethod method;

  DirectRegex() { this = API::moduleImport("re").getMember(method).getACall() }

  /** Gets the pattern argument, passed either first positionally or as `pattern=`. */
  override DataFlow::Node getRegexNode() {
    result = this.getArg(0)
    or
    result = this.getArgByName("pattern")
  }

  /**
   * Gets the argument holding the string the regex is run against, passed either
   * at the position given by `stringArg` or as `string=`.
   */
  override DataFlow::Node getString() {
    result = this.getArg(stringArg(method))
    or
    result = this.getArgByName("string")
  }
}
1541+
/** Helper module for tracking compiled regexes from `re.compile` to the calls that execute them. */
private module CompiledRegexes {
  private import semmle.python.dataflow.new.DataFlow2
  private import semmle.python.RegexTreeView

  // TODO: This module should be refactored once API graphs are more expressive.
  /**
   * A regex compilation: a call to `re.compile` whose pattern is passed either
   * as the first positional argument or as `pattern=`.
   */
  class RegexDefinitonSource extends DataFlow::CallCfgNode {
    DataFlow::Node regexNode;

    RegexDefinitonSource() {
      this = API::moduleImport("re").getMember("compile").getACall() and
      (
        regexNode = this.getArg(0)
        or
        regexNode = this.getArgByName("pattern")
      )
    }

    /** Gets the data flow node for the regex being compiled by this node. */
    DataFlow::Node getRegexNode() { result = regexNode }
  }

  /**
   * A use of a compiled regex: the object on which one of the regex-executing
   * methods is called, e.g. `pattern` in `pattern.match(s)`.
   */
  class RegexDefinitionSink extends DataFlow::Node {
    RegexExecutionMethod method;
    DataFlow::CallCfgNode executingCall;

    RegexDefinitionSink() {
      executingCall =
        API::moduleImport("re").getMember("compile").getReturn().getMember(method).getACall() and
      // The sink is the receiver of the method call, not the call itself.
      exists(DataFlow::AttrRead read |
        read = executingCall.getFunction() and
        this = read.getObject()
      )
    }

    /** Gets the method used to execute the regex. */
    RegexExecutionMethod getMethod() { result = method }

    /** Gets the data flow node for the executing call. */
    DataFlow::CallCfgNode getExecutingCall() { result = executingCall }
  }

  /**
   * A data-flow configuration for finding uses of compiled regexes: flow from a
   * `RegexDefinitonSource` to a `RegexDefinitionSink`.
   */
  class RegexDefinitionConfiguration extends DataFlow2::Configuration {
    RegexDefinitionConfiguration() { this = "RegexDefinitionConfiguration" }

    override predicate isSource(DataFlow::Node source) { source instanceof RegexDefinitonSource }

    override predicate isSink(DataFlow::Node sink) { sink instanceof RegexDefinitionSink }
  }
}
1588+
1589+ private import CompiledRegexes
1590+
/**
 * A call on a compiled regular expression (obtained via `re.compile`) executing
 * that regular expression.
 *
 * Given the following example:
 *
 * ```py
 * pattern = re.compile(input)
 * pattern.match(s)
 * ```
 *
 * This class will identify that `re.compile` compiles `input` and afterwards
 * executes `re`'s `match`. As a result, `this` will refer to `pattern.match(s)`
 * and `this.getRegexNode()` will return the node for `input` (`re.compile`'s first argument).
 *
 * See `RegexExecutionMethod` for the set of methods covered.
 *
 * See https://docs.python.org/3/library/re.html#regular-expression-objects
 */
// Extends the `::Range` extension point (like `DirectRegex` and `ReEscapeCall`)
// rather than the concept class itself, so that this model contributes new
// `RegexExecution` instances.
private class CompiledRegex extends DataFlow::CallCfgNode, RegexExecution::Range {
  /** The pattern node passed to the originating `re.compile` call. */
  DataFlow::Node regexNode;
  /** The name of the method invoked on the compiled regex. */
  RegexExecutionMethod method;

  CompiledRegex() {
    exists(
      RegexDefinitionConfiguration conf, RegexDefinitonSource source, RegexDefinitionSink sink
    |
      conf.hasFlow(source, sink) and
      regexNode = source.getRegexNode() and
      method = sink.getMethod() and
      this = sink.getExecutingCall()
    )
  }

  override DataFlow::Node getRegexNode() { result = regexNode }

  override DataFlow::Node getString() {
    // Methods on a compiled regex do not take the pattern argument, so every
    // positional index is one lower than for the module-level function.
    result in [this.getArg(stringArg(method) - 1), this.getArgByName("string")]
  }
}
1632+
/**
 * A call to `re.escape`, modelled as an escaping step of the regex kind.
 *
 * See https://docs.python.org/3/library/re.html#re.escape
 */
private class ReEscapeCall extends Escaping::Range, DataFlow::CallCfgNode {
  /** The argument being escaped, given positionally or as `pattern=`. */
  DataFlow::Node inputNode;

  ReEscapeCall() {
    this = API::moduleImport("re").getMember("escape").getACall() and
    (
      inputNode = this.getArg(0)
      or
      inputNode = this.getArgByName("pattern")
    )
  }

  override DataFlow::Node getAnInput() { result = inputNode }

  /** Gets the escaped output, which is the call node itself. */
  override DataFlow::Node getOutput() { result = this }

  override string getKind() { result = Escaping::getRegexKind() }
}
1651+
15001652// ---------------------------------------------------------------------------
15011653// OTHER
15021654// ---------------------------------------------------------------------------
0 commit comments