Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit b8194bd

Browse files
committed
Python: Add support for API graphs
Currently only supports the "use" side of things. For the most part, this follows the corresponding implementation for JavaScript. Major differences include: - No `MkImportUse` nodes -- we just move directly from `MkModuleImport` to its uses. - Paths are no longer labelled by s-expressions, but rather by a string that mirrors how you would access it in QL. This makes it very easy to see how to access an API component -- simply look at its `toString`! This PR also extends `LocalSourceNode` to support looking up attribute references and invocations of such nodes. This was again based on the JavaScript equivalent (though without specific classes for `InvokeNode` and the like, it's a bit more awkward to use).
1 parent c9537f2 commit b8194bd

2 files changed

Lines changed: 424 additions & 1 deletion

File tree

Lines changed: 343 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,343 @@
1+
import python
2+
import semmle.python.dataflow.new.DataFlow
3+
4+
module API {
5+
class Node extends Impl::TApiNode {
6+
/**
7+
* Gets a data-flow node corresponding to a use of the API component represented by this node.
8+
*
9+
* For example, `import re; re.escape` is a use of the `escape` function from the
10+
* `re` module, and `import re; re.escape("hello")` is a use of the return of that function.
11+
*
12+
* This includes indirect uses found via data flow, meaning that in
13+
* ```python
14+
* def f(x):
15+
* pass
16+
*
17+
* f(obj.foo)
18+
* ```
19+
* both `obj.foo` and `x` are uses of the `foo` member from `obj`.
20+
*/
21+
DataFlow::Node getAUse() {
22+
exists(DataFlow::LocalSourceNode src | Impl::use(this, src) |
23+
Impl::trackUseNode(src).flowsTo(result)
24+
)
25+
}
26+
27+
/**
28+
* Gets an immediate use of the API component represented by this node.
29+
*
30+
* For example, `import re; re.escape` is an immediate use of the `escape` member
31+
* from the `re` module.
32+
*
33+
* Unlike `getAUse()`, this predicate only gets the immediate references, not the indirect uses
34+
* found via data flow. This means that in `x = re.escape` only `re.escape` is a reference
35+
* to the `escape` member of `re`, neither `x` nor any node that `x` flows to is a reference to
36+
* this API component.
37+
*/
38+
DataFlow::LocalSourceNode getAnImmediateUse() { Impl::use(this, result) }
39+
40+
/**
41+
* Gets a call to the function represented by this API component.
42+
*/
43+
DataFlow::Node getACall() { result = getReturn().getAnImmediateUse() }
44+
45+
/**
46+
* Gets a node representing member `m` of this API component.
47+
*
48+
* For example, in `import re; re.escape` the attribute read `re.escape` is a use of the
49+
* `escape` member of the `re` module.
50+
*/
51+
bindingset[m]
52+
bindingset[result]
53+
Node getMember(string m) { result = getASuccessor(Label::member(m)) }
54+
55+
/**
56+
* Gets a node representing a member of this API component where the name of the member is
57+
* not known statically.
58+
*/
59+
Node getUnknownMember() { result = getASuccessor(Label::unknownMember()) }
60+
61+
/**
62+
* Gets a node representing a member of this API component where the name of the member may
63+
* or may not be known statically.
64+
*/
65+
Node getAMember() {
66+
result = getASuccessor(Label::member(_)) or
67+
result = getUnknownMember()
68+
}
69+
70+
/**
71+
* Gets a node representing the result of the function represented by this node.
72+
*
73+
* This predicate may have multiple results when there are multiple invocations of this API component.
74+
* Consider using `getACall()` if there is a need to distinguish between individual calls.
75+
*/
76+
Node getReturn() { result = getASuccessor(Label::return()) }
77+
78+
/**
79+
* Gets a string representation of the lexicographically least among all shortest access paths
80+
* from the root to this node.
81+
*/
82+
string getPath() { result = min(string p | p = getAPath(Impl::distanceFromRoot(this)) | p) }
83+
84+
/**
85+
* Gets a node such that there is an edge in the API graph between this node and the other
86+
* one, and that edge is labeled with `lbl`.
87+
*/
88+
Node getASuccessor(string lbl) { Impl::edge(this, lbl, result) }
89+
90+
/**
91+
* Gets a node such that there is an edge in the API graph between that other node and
92+
* this one, and that edge is labeled with `lbl`.
93+
*/
94+
Node getAPredecessor(string lbl) { this = result.getASuccessor(lbl) }
95+
96+
/**
97+
* Gets a node such that there is an edge in the API graph between that other node and
98+
* this one.
99+
*/
100+
Node getAPredecessor() { result = getAPredecessor(_) }
101+
102+
/**
103+
* Gets a node such that there is an edge in the API graph between this node and the other
104+
* one.
105+
*/
106+
Node getASuccessor() { result = getASuccessor(_) }
107+
108+
/**
109+
* Gets the data-flow node that gives rise to this node, if any.
110+
*/
111+
DataFlow::Node getInducingNode() { this = Impl::MkUse(result) }
112+
113+
/**
114+
* Holds if this node is located in file `path` between line `startline`, column `startcol`,
115+
* and line `endline`, column `endcol`.
116+
*
117+
* For nodes that do not have a meaningful location, `path` is the empty string and all other
118+
* parameters are zero.
119+
*/
120+
predicate hasLocationInfo(string path, int startline, int startcol, int endline, int endcol) {
121+
getInducingNode().hasLocationInfo(path, startline, startcol, endline, endcol)
122+
or
123+
not exists(getInducingNode()) and
124+
path = "" and
125+
startline = 0 and
126+
startcol = 0 and
127+
endline = 0 and
128+
endcol = 0
129+
}
130+
131+
/**
132+
* Gets a textual representation of this node.
133+
*/
134+
string toString() {
135+
none() // defined in subclasses
136+
}
137+
138+
/**
139+
* Gets a path of the given `length` from the root to this node.
140+
*/
141+
private string getAPath(int length) {
142+
this instanceof Impl::MkRoot and
143+
length = 0 and
144+
result = ""
145+
or
146+
exists(Node pred, string lbl, string predpath |
147+
Impl::edge(pred, lbl, this) and
148+
lbl != "" and
149+
predpath = pred.getAPath(length - 1) and
150+
exists(string dot | if length = 1 then dot = "" else dot = "." |
151+
result = predpath + dot + lbl and
152+
// avoid producing strings longer than 1MB
153+
result.length() < 1000 * 1000
154+
)
155+
) and
156+
length in [1 .. Impl::distanceFromRoot(this)]
157+
}
158+
159+
/** Gets the shortest distance from the root to this node in the API graph. */
160+
int getDepth() { result = Impl::distanceFromRoot(this) }
161+
}
162+
163+
/** The root node of an API graph. */
164+
class Root extends Node, Impl::MkRoot {
165+
override string toString() { result = "root" }
166+
}
167+
168+
/** A node corresponding to the use of an API component. */
169+
class Use extends Node, Impl::TUse {
170+
override string toString() {
171+
exists(string type |
172+
this = Impl::MkUse(_) and type = "Use "
173+
or
174+
this = Impl::MkModuleImport(_) and type = "ModuleImport "
175+
|
176+
result = type + getPath()
177+
or
178+
not exists(this.getPath()) and result = type + "with no path"
179+
)
180+
}
181+
}
182+
183+
/** Gets the root node. */
184+
Root root() { any() }
185+
186+
/** Gets a node corresponding to an import of module `m`. */
187+
Node moduleImport(string m) { result = Impl::MkModuleImport(m) }
188+
189+
/**
190+
* Provides the actual implementation of API graphs, cached for performance.
191+
*
192+
* Ideally, we'd like nodes to correspond to (global) access paths, with edge labels
193+
* corresponding to extending the access path by one element. We also want to be able to map
194+
* nodes to their definitions and uses in the data-flow graph, and this should happen modulo
195+
* (inter-procedural) data flow.
196+
*
197+
* This, however, is not easy to implement, since access paths can have unbounded length
198+
* and we need some way of recognizing cycles to avoid non-termination. Unfortunately, expressing
199+
* a condition like "this node hasn't been involved in constructing any predecessor of
200+
* this node in the API graph" without negative recursion is tricky.
201+
*
202+
* So instead most nodes are directly associated with a data-flow node, representing
203+
* either a use or a definition of an API component. This ensures that we only have a finite
204+
* number of nodes. However, we can now have multiple nodes with the same access
205+
* path, which are essentially indistinguishable for a client of the API.
206+
*
207+
* On the other hand, a single node can have multiple access paths (which is, of
208+
* course, unavoidable). We pick as canonical the alphabetically least access path with
209+
* shortest length.
210+
*/
211+
cached
212+
private module Impl {
213+
cached
214+
newtype TApiNode =
215+
/** The root of the API graph. */
216+
MkRoot() or
217+
/** An abstract representative for imports of the module called `name`. */
218+
MkModuleImport(string name) { imports(_, name) } or
219+
/** A use of an API member at the node `nd`. */
220+
MkUse(DataFlow::Node nd) { use(_, _, nd) }
221+
222+
/** The union of the `MkModuleImport` and `MkUse` branches of `TApiNode`; extended by `API::Use`. */
class TUse = MkModuleImport or MkUse;
223+
224+
/** Holds if `imp` is an import of a module named `name` */
225+
private predicate imports(DataFlow::Node imp, string name) { imp = DataFlow::importNode(name) }
226+
227+
/**
228+
* Holds if `ref` is a use of a node that should have an incoming edge from `base` labeled
229+
* `lbl` in the API graph.
230+
*/
231+
cached
232+
predicate use(TApiNode base, string lbl, DataFlow::Node ref) {
233+
exists(DataFlow::LocalSourceNode src, DataFlow::LocalSourceNode pred |
234+
use(base, src) and pred = trackUseNode(src)
235+
|
236+
lbl = Label::memberFromRef(ref) and
237+
ref = pred.getAnAttributeRead()
238+
or
239+
lbl = Label::return() and
240+
ref = pred.getAnInvocation()
241+
)
242+
}
243+
244+
/**
245+
* Holds if `ref` is a use of node `nd`.
246+
*/
247+
cached
248+
predicate use(TApiNode nd, DataFlow::Node ref) {
249+
exists(string name |
250+
nd = MkModuleImport(name) and
251+
ref = DataFlow::importNode(name)
252+
)
253+
or
254+
nd = MkUse(ref)
255+
}
256+
257+
/**
258+
* Gets a data-flow node to which `src`, which is a use of an API-graph node, flows.
259+
*
260+
* The flow from `src` to that node may be inter-procedural.
261+
*/
262+
private DataFlow::LocalSourceNode trackUseNode(
263+
DataFlow::LocalSourceNode src, DataFlow::TypeTracker t
264+
) {
265+
t.start() and
266+
use(_, src) and
267+
result = src
268+
or
269+
// Due to bad performance when using `trackUseNode(src, t2).track(t2, t)`
270+
// we have inlined that code and forced a join
271+
exists(DataFlow::StepSummary summary |
272+
t = trackUseNode_first_join(src, result, summary).append(summary)
273+
)
274+
}
275+
276+
/**
* Helper for the two-argument `trackUseNode`: gets a type tracker `result` such that
* `trackUseNode(src, result)` can take a step, described by `summary`, to reach `res`.
*
* Per the comment in `trackUseNode`, this is split out as its own `nomagic` predicate
* to force a join, for performance reasons.
*/
pragma[nomagic]
277+
private DataFlow::TypeTracker trackUseNode_first_join(
278+
DataFlow::LocalSourceNode src, DataFlow::LocalSourceNode res, DataFlow::StepSummary summary
279+
) {
280+
DataFlow::StepSummary::step(trackUseNode(src, result), res, summary)
281+
}
282+
283+
/**
* Gets a local source node reached from `src` by type tracking, that is, a result of the
* two-argument `trackUseNode` whose type tracker is `DataFlow::TypeTracker::end()`.
*/
cached
283+
DataFlow::LocalSourceNode trackUseNode(DataFlow::LocalSourceNode src) {
284+
result = trackUseNode(src, DataFlow::TypeTracker::end())
285+
}
287+
288+
/**
289+
* Holds if there is an edge from `pred` to `succ` in the API graph that is labeled with `lbl`.
290+
*/
291+
cached
292+
predicate edge(Node pred, string lbl, Node succ) {
293+
/* There's an edge from the root node for each imported module. */
294+
exists(string m |
295+
pred = MkRoot() and
296+
lbl = Label::mod(m)
297+
|
298+
succ = MkModuleImport(m)
299+
)
300+
or
301+
/* Every node that is a use of an API component is itself added to the API graph. */
302+
exists(DataFlow::LocalSourceNode ref |
303+
use(pred, lbl, ref) and
304+
succ = MkUse(ref)
305+
)
306+
}
307+
308+
/**
309+
* Holds if there is an edge from `pred` to `succ` in the API graph.
310+
*/
311+
private predicate edge(TApiNode pred, TApiNode succ) { edge(pred, _, succ) }
312+
313+
/** Gets the shortest distance from the root to `nd` in the API graph. */
314+
cached
315+
int distanceFromRoot(TApiNode nd) = shortestDistances(MkRoot/0, edge/2)(_, nd, result)
316+
}
317+
}
318+
319+
private module Label {
320+
/** Gets the edge label for the module `m`. */
321+
bindingset[m]
322+
bindingset[result]
323+
string mod(string m) { result = "moduleImport(\"" + m + "\")" }
324+
325+
/** Gets the `member` edge label for member `m`. */
326+
bindingset[m]
327+
bindingset[result]
328+
string member(string m) { result = "getMember(\"" + m + "\")" }
329+
330+
/** Gets the `member` edge label for the unknown member. */
331+
string unknownMember() { result = "getUnknownMember()" }
332+
333+
/** Gets the `member` edge label for the given attribute reference. */
334+
string memberFromRef(DataFlow::AttrRef pr) {
335+
result = member(pr.getAttributeName())
336+
or
337+
not exists(pr.getAttributeName()) and
338+
result = unknownMember()
339+
}
340+
341+
/** Gets the `return` edge label. */
342+
string return() { result = "getReturn()" }
343+
}

0 commit comments

Comments
 (0)