1010
1111import markupbase
1212import re
13- import string
1413
1514# Regular expressions used for parsing
1615
2322
2423starttagopen = re .compile ('<[a-zA-Z]' )
2524piclose = re .compile ('>' )
26- endtagopen = re .compile ('</' )
2725commentclose = re .compile (r'--\s*>' )
2826tagfind = re .compile ('[a-zA-Z][-.a-zA-Z0-9:_]*' )
2927attrfind = re .compile (
@@ -96,7 +94,6 @@ def __init__(self):
9694 def reset (self ):
9795 """Reset this instance. Loses all unprocessed data."""
9896 self .rawdata = ''
99- self .stack = []
10097 self .lasttag = '???'
10198 self .interesting = interesting_normal
10299 markupbase .ParserBase .reset (self )
@@ -145,18 +142,19 @@ def goahead(self, end):
145142 if i < j : self .handle_data (rawdata [i :j ])
146143 i = self .updatepos (i , j )
147144 if i == n : break
148- if rawdata [i ] == '<' :
145+ startswith = rawdata .startswith
146+ if startswith ('<' , i ):
149147 if starttagopen .match (rawdata , i ): # < + letter
150148 k = self .parse_starttag (i )
151- elif endtagopen . match ( rawdata , i ): # </
149+ elif startswith ( "</" , i ):
152150 k = self .parse_endtag (i )
153151 if k >= 0 :
154152 self .clear_cdata_mode ()
155- elif rawdata . startswith ("<!--" , i ): # <!--
153+ elif startswith ("<!--" , i ):
156154 k = self .parse_comment (i )
157- elif rawdata . startswith ("<?" , i ): # <?
155+ elif startswith ("<?" , i ):
158156 k = self .parse_pi (i )
159- elif rawdata . startswith ("<!" , i ): # <!
157+ elif startswith ("<!" , i ):
160158 k = self .parse_declaration (i )
161159 elif (i + 1 ) < n :
162160 self .handle_data ("<" )
@@ -168,33 +166,32 @@ def goahead(self, end):
168166 self .error ("EOF in middle of construct" )
169167 break
170168 i = self .updatepos (i , k )
171- elif rawdata [ i : i + 2 ] == "&#" :
169+ elif startswith ( "&#" , i ) :
172170 match = charref .match (rawdata , i )
173171 if match :
174172 name = match .group ()[2 :- 1 ]
175173 self .handle_charref (name )
176174 k = match .end ()
177- if rawdata [ k - 1 ] != ';' :
175+ if not startswith ( ';' , k - 1 ) :
178176 k = k - 1
179177 i = self .updatepos (i , k )
180178 continue
181179 else :
182180 break
183- elif rawdata [ i ] == '&' :
181+ elif startswith ( '&' , i ) :
184182 match = entityref .match (rawdata , i )
185183 if match :
186184 name = match .group (1 )
187185 self .handle_entityref (name )
188186 k = match .end ()
189- if rawdata [ k - 1 ] != ';' :
187+ if not startswith ( ';' , k - 1 ) :
190188 k = k - 1
191189 i = self .updatepos (i , k )
192190 continue
193191 match = incomplete .match (rawdata , i )
194192 if match :
195193 # match.group() will contain at least 2 chars
196- rest = rawdata [i :]
197- if end and match .group () == rest :
194+ if end and match .group () == rawdata [i :]:
198195 self .error ("EOF in middle of entity or char ref" )
199196 # incomplete
200197 break
@@ -252,7 +249,7 @@ def parse_starttag(self, i):
252249 match = tagfind .match (rawdata , i + 1 )
253250 assert match , 'unexpected call to parse_starttag()'
254251 k = match .end ()
255- self .lasttag = tag = string . lower ( rawdata [i + 1 :k ])
252+ self .lasttag = tag = rawdata [i + 1 :k ]. lower ( )
256253
257254 while k < endpos :
258255 m = attrfind .match (rawdata , k )
@@ -265,21 +262,21 @@ def parse_starttag(self, i):
265262 attrvalue [:1 ] == '"' == attrvalue [- 1 :]:
266263 attrvalue = attrvalue [1 :- 1 ]
267264 attrvalue = self .unescape (attrvalue )
268- attrs .append ((string .lower (attrname ), attrvalue ))
265+ attrs .append ((attrname .lower (), attrvalue ))
269266 k = m .end ()
270267
271- end = string . strip ( rawdata [k :endpos ])
268+ end = rawdata [k :endpos ]. strip ( )
272269 if end not in (">" , "/>" ):
273270 lineno , offset = self .getpos ()
274271 if "\n " in self .__starttag_text :
275- lineno = lineno + string . count ( self .__starttag_text , "\n " )
272+ lineno = lineno + self .__starttag_text . count ( "\n " )
276273 offset = len (self .__starttag_text ) \
277- - string . rfind ( self .__starttag_text , "\n " )
274+ - self .__starttag_text . rfind ( "\n " )
278275 else :
279276 offset = offset + len (self .__starttag_text )
280277 self .error ("junk characters in start tag: %s"
281278 % `rawdata[k:endpos][:20]` )
282- if end [ - 2 :] == '/>' :
279+ if end . endswith ( '/>' ) :
283280 # XHTML-style empty tag: <span attr="value" />
284281 self .handle_startendtag (tag , attrs )
285282 else :
@@ -299,10 +296,9 @@ def check_for_whole_start_tag(self, i):
299296 if next == ">" :
300297 return j + 1
301298 if next == "/" :
302- s = rawdata [j :j + 2 ]
303- if s == "/>" :
299+ if rawdata .startswith ("/>" , j ):
304300 return j + 2
305- if s == "/" :
301+ if rawdata . startswith ( "/" , j ) :
306302 # buffer boundary
307303 return - 1
308304 # else bogus input
@@ -332,7 +328,7 @@ def parse_endtag(self, i):
332328 if not match :
333329 self .error ("bad end tag: %s" % `rawdata[i:j]` )
334330 tag = match .group (1 )
335- self .handle_endtag (string .lower (tag ))
331+ self .handle_endtag (tag .lower ())
336332 return j
337333
338334 # Overridable -- finish processing of start+end tag: <tag.../>
@@ -379,9 +375,9 @@ def unknown_decl(self, data):
def unescape(self, s):
    """Return *s* with the five basic entity references decoded.

    Replaces ``&lt;``, ``&gt;``, ``&apos;``, ``&quot;`` and ``&amp;`` with
    the characters they stand for.  Any other ``&...;`` sequence is left
    exactly as it appears in the input.
    """
    # Fast path: without an ampersand there is nothing to decode.
    if '&' not in s:
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    # Must be last: decoding "&amp;" earlier would turn "&amp;lt;" into
    # "&lt;" and then, wrongly, into "<" (double unescaping).
    s = s.replace("&amp;", "&")
    return s
0 commit comments