Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit c1c2b3e

Browse files
committed
dis(): This had a problem with proto 0 pickles, in that POP sometimes
popped a MARK, but without stack emulation the disassembler couldn't know that, and subsequent indentation got hosed. Now the disassembler does do enough stack emulation to catch this. While I was at it, also added lots of sanity checks for other stack operations, and correct use of the memo. This goes (I think) a long way toward being a "pickle verifier" now too.
1 parent 5d9113d commit c1c2b3e

1 file changed

Lines changed: 113 additions & 17 deletions

File tree

Lib/pickletools.py

Lines changed: 113 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,11 @@
1313
# Other ideas:
1414
#
1515
# - A pickle verifier: read a pickle and check it exhaustively for
16-
# well-formedness.
16+
# well-formedness. dis() does a lot of this already.
1717
#
1818
# - A protocol identifier: examine a pickle and return its protocol number
1919
# (== the highest .proto attr value among all the opcodes in the pickle).
20+
# dis() already prints this info at the end.
2021
#
2122
# - A pickle optimizer: for example, tuple-building code is sometimes more
2223
# elaborate than necessary, catering for the possibility that the tuple
@@ -712,6 +713,9 @@ def __init__(self, name, obtype, doc):
712713
assert isinstance(doc, str)
713714
self.doc = doc
714715

716+
def __repr__(self):
717+
return self.name
718+
715719

716720
pyint = StackObject(
717721
name='int',
@@ -1858,10 +1862,33 @@ def dis(pickle, out=None, indentlevel=4):
18581862
18591863
Optional arg indentlevel is the number of blanks by which to indent
18601864
a new MARK level. It defaults to 4.
1865+
1866+
In addition to printing the disassembly, some sanity checks are made:
1867+
1868+
+ All embedded opcode arguments "make sense".
1869+
1870+
+ Explicit and implicit pop operations have enough items on the stack.
1871+
1872+
+ When an opcode implicitly refers to a markobject, a markobject is
1873+
actually on the stack.
1874+
1875+
+ A memo entry isn't referenced before it's defined.
1876+
1877+
+ The markobject isn't stored in the memo.
1878+
1879+
+ A memo entry isn't redefined.
18611880
"""
18621881

1863-
markstack = []
1882+
# Most of the hair here is for sanity checks, but most of it is needed
1883+
# anyway to detect when a protocol 0 POP takes a MARK off the stack
1884+
# (which in turn is needed to indent MARK blocks correctly).
1885+
1886+
stack = [] # crude emulation of unpickler stack
1887+
memo = {} # crude emulation of unpickler memo
1888+
maxproto = -1 # max protocol number seen
1889+
markstack = [] # bytecode positions of MARK opcodes
18641890
indentchunk = ' ' * indentlevel
1891+
errormsg = None
18651892
for opcode, arg, pos in genops(pickle):
18661893
if pos is not None:
18671894
print >> out, "%5d:" % pos,
@@ -1870,12 +1897,54 @@ def dis(pickle, out=None, indentlevel=4):
18701897
indentchunk * len(markstack),
18711898
opcode.name)
18721899

1900+
maxproto = max(maxproto, opcode.proto)
1901+
1902+
# See whether a MARK should be popped.
1903+
before = opcode.stack_before # don't mutate
1904+
after = opcode.stack_after # don't mutate
18731905
markmsg = None
1874-
if markstack and markobject in opcode.stack_before:
1875-
assert markobject not in opcode.stack_after
1876-
markpos = markstack.pop()
1877-
if markpos is not None:
1878-
markmsg = "(MARK at %d)" % markpos
1906+
if markobject in before or (opcode.name == "POP" and
1907+
stack and
1908+
stack[-1] is markobject):
1909+
assert markobject not in after
1910+
if markstack:
1911+
markpos = markstack.pop()
1912+
if markpos is None:
1913+
markmsg = "(MARK at unknown opcode offset)"
1914+
else:
1915+
markmsg = "(MARK at %d)" % markpos
1916+
# Pop everything at and after the topmost markobject.
1917+
while stack[-1] is not markobject:
1918+
stack.pop()
1919+
stack.pop()
1920+
# Remove markobject stuff from stack_before.
1921+
try:
1922+
i = before.index(markobject)
1923+
before = before[:i]
1924+
except ValueError:
1925+
assert opcode.name == "POP"
1926+
assert len(before) == 1
1927+
before = [] # stop code later from popping again
1928+
else:
1929+
errormsg = markmsg = "no MARK exists on stack"
1930+
1931+
# Check for correct memo usage.
1932+
if opcode.name in ("PUT", "BINPUT", "LONG_BINPUT"):
1933+
if arg in memo:
1934+
errormsg = "memo key %r already defined" % arg
1935+
elif not stack:
1936+
errormsg = "stack is empty -- can't store into memo"
1937+
elif stack[-1] is markobject:
1938+
errormsg = "can't store markobject in the memo"
1939+
else:
1940+
memo[arg] = stack[-1]
1941+
1942+
elif opcode.name in ("GET", "BINGET", "LONG_BINGET"):
1943+
if arg in memo:
1944+
assert len(after) == 1
1945+
after = [memo[arg]] # for better stack emulation
1946+
else:
1947+
errormsg = "memo key %r has never been stored into" % arg
18791948

18801949
if arg is not None or markmsg:
18811950
# make a mild effort to align arguments
@@ -1886,10 +1955,27 @@ def dis(pickle, out=None, indentlevel=4):
18861955
line += ' ' + markmsg
18871956
print >> out, line
18881957

1889-
if markobject in opcode.stack_after:
1958+
if errormsg:
1959+
# Note that we delayed complaining until the offending opcode
1960+
# was printed.
1961+
raise ValueError(errormsg)
1962+
1963+
# Emulate the stack effects.
1964+
n = len(before)
1965+
if len(stack) < n:
1966+
raise ValueError("tried to pop %d items from stack with "
1967+
"only %d items" % (n, len(stack)))
1968+
if n:
1969+
del stack[-n:]
1970+
if markobject in after:
18901971
assert markobject not in opcode.stack_before
18911972
markstack.append(pos)
18921973

1974+
stack.extend(after)
1975+
1976+
print >> out, "highest protocol among opcodes =", maxproto
1977+
if stack:
1978+
raise ValueError("stack not empty after STOP: %r" % stack)
18931979

18941980
_dis_test = r"""
18951981
>>> import pickle
@@ -1919,6 +2005,7 @@ def dis(pickle, out=None, indentlevel=4):
19192005
48: s SETITEM
19202006
49: a APPEND
19212007
50: . STOP
2008+
highest protocol among opcodes = 0
19222009
19232010
Try again with a "binary" pickle.
19242011
@@ -1943,6 +2030,7 @@ def dis(pickle, out=None, indentlevel=4):
19432030
36: s SETITEM
19442031
37: e APPENDS (MARK at 3)
19452032
38: . STOP
2033+
highest protocol among opcodes = 1
19462034
19472035
Exercise the INST/OBJ/BUILD family.
19482036
@@ -1951,6 +2039,7 @@ def dis(pickle, out=None, indentlevel=4):
19512039
0: c GLOBAL 'random random'
19522040
15: p PUT 0
19532041
18: . STOP
2042+
highest protocol among opcodes = 0
19542043
19552044
>>> x = [pickle.PicklingError()] * 2
19562045
>>> dis(pickle.dumps(x, 0))
@@ -1973,6 +2062,7 @@ def dis(pickle, out=None, indentlevel=4):
19732062
52: g GET 1
19742063
55: a APPEND
19752064
56: . STOP
2065+
highest protocol among opcodes = 0
19762066
19772067
>>> dis(pickle.dumps(x, 1))
19782068
0: ] EMPTY_LIST
@@ -1993,6 +2083,7 @@ def dis(pickle, out=None, indentlevel=4):
19932083
46: h BINGET 2
19942084
48: e APPENDS (MARK at 3)
19952085
49: . STOP
2086+
highest protocol among opcodes = 1
19962087
19972088
Try "the canonical" recursive-object test.
19982089
@@ -2017,6 +2108,8 @@ def dis(pickle, out=None, indentlevel=4):
20172108
10: p PUT 1
20182109
13: a APPEND
20192110
14: . STOP
2111+
highest protocol among opcodes = 0
2112+
20202113
>>> dis(pickle.dumps(L, 1))
20212114
0: ] EMPTY_LIST
20222115
1: q BINPUT 0
@@ -2026,13 +2119,11 @@ def dis(pickle, out=None, indentlevel=4):
20262119
7: q BINPUT 1
20272120
9: a APPEND
20282121
10: . STOP
2122+
highest protocol among opcodes = 1
20292123
2030-
The protocol 0 pickle of the tuple causes the disassembly to get confused,
2031-
as it doesn't realize that the POP opcode at 16 gets rid of the MARK at 0
2032-
(so the output remains indented until the end). The protocol 1 pickle
2033-
doesn't trigger this glitch, because the disassembler realizes that
2034-
POP_MARK gets rid of the MARK. Doing a better job on the protocol 0
2035-
pickle would require the disassembler to emulate the stack.
2124+
Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
2125+
has to emulate the stack in order to realize that the POP opcode at 16 gets
2126+
rid of the MARK at 0.
20362127
20372128
>>> dis(pickle.dumps(T, 0))
20382129
0: ( MARK
@@ -2045,9 +2136,11 @@ def dis(pickle, out=None, indentlevel=4):
20452136
11: p PUT 1
20462137
14: a APPEND
20472138
15: 0 POP
2048-
16: 0 POP
2049-
17: g GET 1
2050-
20: . STOP
2139+
16: 0 POP (MARK at 0)
2140+
17: g GET 1
2141+
20: . STOP
2142+
highest protocol among opcodes = 0
2143+
20512144
>>> dis(pickle.dumps(T, 1))
20522145
0: ( MARK
20532146
1: ] EMPTY_LIST
@@ -2060,6 +2153,7 @@ def dis(pickle, out=None, indentlevel=4):
20602153
11: 1 POP_MARK (MARK at 0)
20612154
12: h BINGET 1
20622155
14: . STOP
2156+
highest protocol among opcodes = 1
20632157
20642158
Try protocol 2.
20652159
@@ -2072,6 +2166,7 @@ def dis(pickle, out=None, indentlevel=4):
20722166
8: q BINPUT 1
20732167
10: a APPEND
20742168
11: . STOP
2169+
highest protocol among opcodes = 2
20752170
20762171
>>> dis(pickle.dumps(T, 2))
20772172
0: \x80 PROTO 2
@@ -2084,6 +2179,7 @@ def dis(pickle, out=None, indentlevel=4):
20842179
11: 0 POP
20852180
12: h BINGET 1
20862181
14: . STOP
2182+
highest protocol among opcodes = 2
20872183
"""
20882184

20892185
__test__ = {'disassembler_test': _dis_test,

0 commit comments

Comments
 (0)