2121
2222interesting = re .compile ('[&<]' )
2323incomplete = re .compile ('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
24- '<([a-zA-Z][^<>]*|'
25- '/([a-zA-Z][^<>]*)?|'
26- '![^<>]*)?' )
24+ '<([a-zA-Z][^<>]*|'
25+ '/([a-zA-Z][^<>]*)?|'
26+ '![^<>]*)?' )
2727
2828entityref = re .compile ('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]' )
2929charref = re .compile ('&#([0-9]+)[^0-9]' )
@@ -58,8 +58,8 @@ class SGMLParseError(RuntimeError):
5858class SGMLParser (_markupbase .ParserBase ):
5959 # Definition of entities -- derived classes may override
6060 entity_or_charref = re .compile ('&(?:'
61- '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
62- ')(;?)' )
61+ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
62+ ')(;?)' )
6363
6464 def __init__ (self , verbose = 0 ):
6565 """Initialize and reset this instance."""
@@ -121,32 +121,37 @@ def goahead(self, end):
121121 i = n
122122 break
123123 match = interesting .search (rawdata , i )
124- if match : j = match .start ()
125- else : j = n
124+ if match :
125+ j = match .start ()
126+ else :
127+ j = n
126128 if i < j :
127129 self .handle_data (rawdata [i :j ])
128130 i = j
129- if i == n : break
131+ if i == n :
132+ break
130133 if rawdata [i ] == '<' :
131134 if starttagopen .match (rawdata , i ):
132135 if self .literal :
133136 self .handle_data (rawdata [i ])
134- i = i + 1
137+ i = i + 1
135138 continue
136139 k = self .parse_starttag (i )
137- if k < 0 : break
140+ if k < 0 :
141+ break
138142 i = k
139143 continue
140144 if rawdata .startswith ("</" , i ):
141145 k = self .parse_endtag (i )
142- if k < 0 : break
146+ if k < 0 :
147+ break
143148 i = k
144149 self .literal = 0
145150 continue
146151 if self .literal :
147152 if n > (i + 1 ):
148153 self .handle_data ("<" )
149- i = i + 1
154+ i = i + 1
150155 else :
151156 # incomplete
152157 break
@@ -157,40 +162,45 @@ def goahead(self, end):
157162 # This should be removed,
158163 # and comments handled only in parse_declaration.
159164 k = self .parse_comment (i )
160- if k < 0 : break
165+ if k < 0 :
166+ break
161167 i = k
162168 continue
163169 if rawdata .startswith ("<?" , i ):
164170 k = self .parse_pi (i )
165- if k < 0 : break
171+ if k < 0 :
172+ break
166173 i = i + k
167174 continue
168175 if rawdata .startswith ("<!" , i ):
169176 # This is some sort of declaration; in "HTML as
170177 # deployed," this should only be the document type
171178 # declaration ("<!DOCTYPE html...>").
172179 k = self .parse_declaration (i )
173- if k < 0 : break
180+ if k < 0 :
181+ break
174182 i = k
175183 continue
176184 elif rawdata [i ] == '&' :
177185 if self .literal :
178186 self .handle_data (rawdata [i ])
179- i = i + 1
187+ i = i + 1
180188 continue
181189 match = charref .match (rawdata , i )
182190 if match :
183191 name = match .group (1 )
184192 self .handle_charref (name )
185193 i = match .end (0 )
186- if rawdata [i - 1 ] != ';' : i = i - 1
194+ if rawdata [i - 1 ] != ';' :
195+ i = i - 1
187196 continue
188197 match = entityref .match (rawdata , i )
189198 if match :
190199 name = match .group (1 )
191200 self .handle_entityref (name )
192201 i = match .end (0 )
193- if rawdata [i - 1 ] != ';' : i = i - 1
202+ if rawdata [i - 1 ] != ';' :
203+ i = i - 1
194204 continue
195205 else :
196206 self .error ('neither < nor & ??' )
@@ -199,11 +209,11 @@ def goahead(self, end):
199209 match = incomplete .match (rawdata , i )
200210 if not match :
201211 self .handle_data (rawdata [i ])
202- i = i + 1
212+ i = i + 1
203213 continue
204214 j = match .end (0 )
205215 if j == n :
206- break # Really incomplete
216+ break # Really incomplete
207217 self .handle_data (rawdata [i :j ])
208218 i = j
209219 # end while
@@ -256,40 +266,41 @@ def parse_starttag(self, i):
256266 # As a shortcut way to exit, this isn't so bad, but shouldn't
257267 # be used to locate the actual end of the start tag since the
258268 # < or > characters may be embedded in an attribute value.
259- match = endbracket .search (rawdata , i + 1 )
269+ match = endbracket .search (rawdata , i + 1 )
260270 if not match :
261271 return - 1
262272 j = match .start (0 )
263- # Now parse the data between i+ 1 and j into a tag and attrs
273+ # Now parse the data between i + 1 and j into a tag and attrs
264274 attrs = []
265275 if rawdata [i :i + 2 ] == '<>' :
266276 # SGML shorthand: <> == <last open tag seen>
267277 k = j
268278 tag = self .lasttag
269279 else :
270- match = tagfind .match (rawdata , i + 1 )
280+ match = tagfind .match (rawdata , i + 1 )
271281 if not match :
272282 self .error ('unexpected call to parse_starttag' )
273283 k = match .end (0 )
274- tag = rawdata [i + 1 :k ].lower ()
284+ tag = rawdata [i + 1 :k ].lower ()
275285 self .lasttag = tag
276286 while k < j :
277287 match = attrfind .match (rawdata , k )
278- if not match : break
288+ if not match :
289+ break
279290 attrname , rest , attrvalue = match .group (1 , 2 , 3 )
280291 if not rest :
281292 attrvalue = attrname
282293 else :
283294 if (attrvalue [:1 ] == "'" == attrvalue [- 1 :] or
284- attrvalue [:1 ] == '"' == attrvalue [- 1 :]):
295+ attrvalue [:1 ] == '"' == attrvalue [- 1 :]):
285296 # strip quotes
286297 attrvalue = attrvalue [1 :- 1 ]
287298 attrvalue = self .entity_or_charref .sub (
288299 self ._convert_ref , attrvalue )
289300 attrs .append ((attrname .lower (), attrvalue ))
290301 k = match .end (0 )
291302 if rawdata [j ] == '>' :
292- j = j + 1
303+ j = j + 1
293304 self .__starttag_text = rawdata [start_pos :j ]
294305 self .finish_starttag (tag , attrs )
295306 return j
@@ -308,13 +319,13 @@ def _convert_ref(self, match):
308319 # Internal -- parse endtag
309320 def parse_endtag (self , i ):
310321 rawdata = self .rawdata
311- match = endbracket .search (rawdata , i + 1 )
322+ match = endbracket .search (rawdata , i + 1 )
312323 if not match :
313324 return - 1
314325 j = match .start (0 )
315326 tag = rawdata [i + 2 :j ].strip ().lower ()
316327 if rawdata [j ] == '>' :
317- j = j + 1
328+ j = j + 1
318329 self .finish_endtag (tag )
319330 return j
320331
@@ -361,7 +372,8 @@ def finish_endtag(self, tag):
361372 return
362373 found = len (self .stack )
363374 for i in range (found ):
364- if self .stack [i ] == tag : found = i
375+ if self .stack [i ] == tag :
376+ found = i
365377 while len (self .stack ) > found :
366378 tag = self .stack [- 1 ]
367379 try :
@@ -411,7 +423,7 @@ def handle_charref(self, name):
411423
412424 # Definition of entities -- derived classes may override
413425 entitydefs = \
414- {'lt' : '<' , 'gt' : '>' , 'amp' : '&' , 'quot' : '"' , 'apos' : '\' ' }
426+ {'lt' : '<' , 'gt' : '>' , 'amp' : '&' , 'quot' : '"' , 'apos' : '\' ' }
415427
416428 def convert_entityref (self , name ):
417429 """Convert entity references.
@@ -450,10 +462,17 @@ def handle_pi(self, data):
450462 pass
451463
452464 # To be overridden -- handlers for unknown objects
453- def unknown_starttag (self , tag , attrs ): pass
454- def unknown_endtag (self , tag ): pass
455- def unknown_charref (self , ref ): pass
456- def unknown_entityref (self , ref ): pass
465+ def unknown_starttag (self , tag , attrs ):
466+ pass
467+
468+ def unknown_endtag (self , tag ):
469+ pass
470+
471+ def unknown_charref (self , ref ):
472+ pass
473+
474+ def unknown_entityref (self , ref ):
475+ pass
457476
458477
459478class TestSGMLParser (SGMLParser ):
@@ -511,7 +530,7 @@ def close(self):
511530 self .flush ()
512531
513532
514- def test (args = None ):
533+ def test (args = None ):
515534 import sys
516535
517536 if args is None :
@@ -548,4 +567,4 @@ def test(args = None):
548567
549568
550569if __name__ == '__main__' :
551- test ()
570+ test ()
0 commit comments