1313# Other ideas:
1414#
1515# - A pickle verifier: read a pickle and check it exhaustively for
16- # well-formedness.
16+ # well-formedness. dis() does a lot of this already.
1717#
1818# - A protocol identifier: examine a pickle and return its protocol number
1919# (== the highest .proto attr value among all the opcodes in the pickle).
20+ # dis() already prints this info at the end.
2021#
2122# - A pickle optimizer: for example, tuple-building code is sometimes more
2223# elaborate than necessary, catering for the possibility that the tuple
@@ -712,6 +713,9 @@ def __init__(self, name, obtype, doc):
712713 assert isinstance (doc , str )
713714 self .doc = doc
714715
716+ def __repr__ (self ):
717+ return self .name
718+
715719
716720pyint = StackObject (
717721 name = 'int' ,
@@ -1858,10 +1862,33 @@ def dis(pickle, out=None, indentlevel=4):
18581862
18591863 Optional arg indentlevel is the number of blanks by which to indent
18601864 a new MARK level. It defaults to 4.
1865+
1866+ In addition to printing the disassembly, some sanity checks are made:
1867+
1868+ + All embedded opcode arguments "make sense".
1869+
1870+ + Explicit and implicit pop operations have enough items on the stack.
1871+
1872+ + When an opcode implicitly refers to a markobject, a markobject is
1873+ actually on the stack.
1874+
1875+ + A memo entry isn't referenced before it's defined.
1876+
1877+ + The markobject isn't stored in the memo.
1878+
1879+ + A memo entry isn't redefined.
18611880 """
18621881
1863- markstack = []
1882+ # Most of the hair here is for sanity checks, but most of it is needed
1883+ # anyway to detect when a protocol 0 POP takes a MARK off the stack
1884+ # (which in turn is needed to indent MARK blocks correctly).
1885+
1886+ stack = [] # crude emulation of unpickler stack
1887+ memo = {} # crude emulation of unpickler memo
1888+ maxproto = - 1 # max protocol number seen
1889+ markstack = [] # bytecode positions of MARK opcodes
18641890 indentchunk = ' ' * indentlevel
1891+ errormsg = None
18651892 for opcode , arg , pos in genops (pickle ):
18661893 if pos is not None :
18671894 print >> out , "%5d:" % pos ,
@@ -1870,12 +1897,54 @@ def dis(pickle, out=None, indentlevel=4):
18701897 indentchunk * len (markstack ),
18711898 opcode .name )
18721899
1900+ maxproto = max (maxproto , opcode .proto )
1901+
1902+ # See whether a MARK should be popped.
1903+ before = opcode .stack_before # don't mutate
1904+ after = opcode .stack_after # don't mutate
18731905 markmsg = None
1874- if markstack and markobject in opcode .stack_before :
1875- assert markobject not in opcode .stack_after
1876- markpos = markstack .pop ()
1877- if markpos is not None :
1878- markmsg = "(MARK at %d)" % markpos
1906+ if markobject in before or (opcode .name == "POP" and
1907+ stack and
1908+ stack [- 1 ] is markobject ):
1909+ assert markobject not in after
1910+ if markstack :
1911+ markpos = markstack .pop ()
1912+ if markpos is None :
1913+ markmsg = "(MARK at unknown opcode offset)"
1914+ else :
1915+ markmsg = "(MARK at %d)" % markpos
1916+ # Pop everything at and after the topmost markobject.
1917+ while stack [- 1 ] is not markobject :
1918+ stack .pop ()
1919+ stack .pop ()
1920+ # Remove markobject stuff from stack_before.
1921+ try :
1922+ i = before .index (markobject )
1923+ before = before [:i ]
1924+ except ValueError :
1925+ assert opcode .name == "POP"
1926+ assert len (before ) == 1
1927+ before = [] # stop code later from popping again
1928+ else :
1929+ errormsg = markmsg = "no MARK exists on stack"
1930+
1931+ # Check for correct memo usage.
1932+ if opcode .name in ("PUT" , "BINPUT" , "LONG_BINPUT" ):
1933+ if arg in memo :
1934+ errormsg = "memo key %r already defined" % arg
1935+ elif not stack :
1936+ errormsg = "stack is empty -- can't store into memo"
1937+ elif stack [- 1 ] is markobject :
1938+ errormsg = "can't store markobject in the memo"
1939+ else :
1940+ memo [arg ] = stack [- 1 ]
1941+
1942+ elif opcode .name in ("GET" , "BINGET" , "LONG_BINGET" ):
1943+ if arg in memo :
1944+ assert len (after ) == 1
1945+ after = [memo [arg ]] # for better stack emulation
1946+ else :
1947+ errormsg = "memo key %r has never been stored into" % arg
18791948
18801949 if arg is not None or markmsg :
18811950 # make a mild effort to align arguments
@@ -1886,10 +1955,27 @@ def dis(pickle, out=None, indentlevel=4):
18861955 line += ' ' + markmsg
18871956 print >> out , line
18881957
1889- if markobject in opcode .stack_after :
1958+ if errormsg :
1959+ # Note that we delayed complaining until the offending opcode
1960+ # was printed.
1961+ raise ValueError (errormsg )
1962+
1963+ # Emulate the stack effects.
1964+ n = len (before )
1965+ if len (stack ) < n :
1966+ raise ValueError ("tried to pop %d items from stack with "
1967+ "only %d items" % (n , len (stack )))
1968+ if n :
1969+ del stack [- n :]
1970+ if markobject in after :
18901971 assert markobject not in opcode .stack_before
18911972 markstack .append (pos )
18921973
1974+ stack .extend (after )
1975+
1976+ print >> out , "highest protocol among opcodes =" , maxproto
1977+ if stack :
1978+ raise ValueError ("stack not empty after STOP: %r" % stack )
18931979
18941980_dis_test = r"""
18951981>>> import pickle
@@ -1919,6 +2005,7 @@ def dis(pickle, out=None, indentlevel=4):
19192005 48: s SETITEM
19202006 49: a APPEND
19212007 50: . STOP
2008+ highest protocol among opcodes = 0
19222009
19232010Try again with a "binary" pickle.
19242011
@@ -1943,6 +2030,7 @@ def dis(pickle, out=None, indentlevel=4):
19432030 36: s SETITEM
19442031 37: e APPENDS (MARK at 3)
19452032 38: . STOP
2033+ highest protocol among opcodes = 1
19462034
19472035Exercise the INST/OBJ/BUILD family.
19482036
@@ -1951,6 +2039,7 @@ def dis(pickle, out=None, indentlevel=4):
19512039 0: c GLOBAL 'random random'
19522040 15: p PUT 0
19532041 18: . STOP
2042+ highest protocol among opcodes = 0
19542043
19552044>>> x = [pickle.PicklingError()] * 2
19562045>>> dis(pickle.dumps(x, 0))
@@ -1973,6 +2062,7 @@ def dis(pickle, out=None, indentlevel=4):
19732062 52: g GET 1
19742063 55: a APPEND
19752064 56: . STOP
2065+ highest protocol among opcodes = 0
19762066
19772067>>> dis(pickle.dumps(x, 1))
19782068 0: ] EMPTY_LIST
@@ -1993,6 +2083,7 @@ def dis(pickle, out=None, indentlevel=4):
19932083 46: h BINGET 2
19942084 48: e APPENDS (MARK at 3)
19952085 49: . STOP
2086+ highest protocol among opcodes = 1
19962087
19972088Try "the canonical" recursive-object test.
19982089
@@ -2017,6 +2108,8 @@ def dis(pickle, out=None, indentlevel=4):
20172108 10: p PUT 1
20182109 13: a APPEND
20192110 14: . STOP
2111+ highest protocol among opcodes = 0
2112+
20202113>>> dis(pickle.dumps(L, 1))
20212114 0: ] EMPTY_LIST
20222115 1: q BINPUT 0
@@ -2026,13 +2119,11 @@ def dis(pickle, out=None, indentlevel=4):
20262119 7: q BINPUT 1
20272120 9: a APPEND
20282121 10: . STOP
2122+ highest protocol among opcodes = 1
20292123
2030- The protocol 0 pickle of the tuple causes the disassembly to get confused,
2031- as it doesn't realize that the POP opcode at 16 gets rid of the MARK at 0
2032- (so the output remains indented until the end). The protocol 1 pickle
2033- doesn't trigger this glitch, because the disassembler realizes that
2034- POP_MARK gets rid of the MARK. Doing a better job on the protocol 0
2035- pickle would require the disassembler to emulate the stack.
2124+ Note that, in the protocol 0 pickle of the recursive tuple, the disassembler
2125+ has to emulate the stack in order to realize that the POP opcode at 16 gets
2126+ rid of the MARK at 0.
20362127
20372128>>> dis(pickle.dumps(T, 0))
20382129 0: ( MARK
@@ -2045,9 +2136,11 @@ def dis(pickle, out=None, indentlevel=4):
20452136 11: p PUT 1
20462137 14: a APPEND
20472138 15: 0 POP
2048- 16: 0 POP
2049- 17: g GET 1
2050- 20: . STOP
2139+ 16: 0 POP (MARK at 0)
2140+ 17: g GET 1
2141+ 20: . STOP
2142+ highest protocol among opcodes = 0
2143+
20512144>>> dis(pickle.dumps(T, 1))
20522145 0: ( MARK
20532146 1: ] EMPTY_LIST
@@ -2060,6 +2153,7 @@ def dis(pickle, out=None, indentlevel=4):
20602153 11: 1 POP_MARK (MARK at 0)
20612154 12: h BINGET 1
20622155 14: . STOP
2156+ highest protocol among opcodes = 1
20632157
20642158Try protocol 2.
20652159
@@ -2072,6 +2166,7 @@ def dis(pickle, out=None, indentlevel=4):
20722166 8: q BINPUT 1
20732167 10: a APPEND
20742168 11: . STOP
2169+ highest protocol among opcodes = 2
20752170
20762171>>> dis(pickle.dumps(T, 2))
20772172 0: \x80 PROTO 2
@@ -2084,6 +2179,7 @@ def dis(pickle, out=None, indentlevel=4):
20842179 11: 0 POP
20852180 12: h BINGET 1
20862181 14: . STOP
2182+ highest protocol among opcodes = 2
20872183"""
20882184
20892185__test__ = {'disassembler_test' : _dis_test ,
0 commit comments