Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 248b043

Browse files
committed
Convert to using string methods instead of the string module.
In goahead(), use a bound version of rawdata.startswith() since we use the same method all the time and never change the value of rawdata. This can save a lot of bound method creation.
1 parent 073148c commit 248b043

1 file changed

Lines changed: 25 additions & 29 deletions

File tree

Lib/HTMLParser.py

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import markupbase
1212
import re
13-
import string
1413

1514
# Regular expressions used for parsing
1615

@@ -23,7 +22,6 @@
2322

2423
starttagopen = re.compile('<[a-zA-Z]')
2524
piclose = re.compile('>')
26-
endtagopen = re.compile('</')
2725
commentclose = re.compile(r'--\s*>')
2826
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
2927
attrfind = re.compile(
@@ -96,7 +94,6 @@ def __init__(self):
9694
def reset(self):
9795
"""Reset this instance. Loses all unprocessed data."""
9896
self.rawdata = ''
99-
self.stack = []
10097
self.lasttag = '???'
10198
self.interesting = interesting_normal
10299
markupbase.ParserBase.reset(self)
@@ -145,18 +142,19 @@ def goahead(self, end):
145142
if i < j: self.handle_data(rawdata[i:j])
146143
i = self.updatepos(i, j)
147144
if i == n: break
148-
if rawdata[i] == '<':
145+
startswith = rawdata.startswith
146+
if startswith('<', i):
149147
if starttagopen.match(rawdata, i): # < + letter
150148
k = self.parse_starttag(i)
151-
elif endtagopen.match(rawdata, i): # </
149+
elif startswith("</", i):
152150
k = self.parse_endtag(i)
153151
if k >= 0:
154152
self.clear_cdata_mode()
155-
elif rawdata.startswith("<!--", i): # <!--
153+
elif startswith("<!--", i):
156154
k = self.parse_comment(i)
157-
elif rawdata.startswith("<?", i): # <?
155+
elif startswith("<?", i):
158156
k = self.parse_pi(i)
159-
elif rawdata.startswith("<!", i): # <!
157+
elif startswith("<!", i):
160158
k = self.parse_declaration(i)
161159
elif (i + 1) < n:
162160
self.handle_data("<")
@@ -168,33 +166,32 @@ def goahead(self, end):
168166
self.error("EOF in middle of construct")
169167
break
170168
i = self.updatepos(i, k)
171-
elif rawdata[i:i+2] == "&#":
169+
elif startswith("&#", i):
172170
match = charref.match(rawdata, i)
173171
if match:
174172
name = match.group()[2:-1]
175173
self.handle_charref(name)
176174
k = match.end()
177-
if rawdata[k-1] != ';':
175+
if not startswith(';', k-1):
178176
k = k - 1
179177
i = self.updatepos(i, k)
180178
continue
181179
else:
182180
break
183-
elif rawdata[i] == '&':
181+
elif startswith('&', i):
184182
match = entityref.match(rawdata, i)
185183
if match:
186184
name = match.group(1)
187185
self.handle_entityref(name)
188186
k = match.end()
189-
if rawdata[k-1] != ';':
187+
if not startswith(';', k-1):
190188
k = k - 1
191189
i = self.updatepos(i, k)
192190
continue
193191
match = incomplete.match(rawdata, i)
194192
if match:
195193
# match.group() will contain at least 2 chars
196-
rest = rawdata[i:]
197-
if end and match.group() == rest:
194+
if end and match.group() == rawdata[i:]:
198195
self.error("EOF in middle of entity or char ref")
199196
# incomplete
200197
break
@@ -252,7 +249,7 @@ def parse_starttag(self, i):
252249
match = tagfind.match(rawdata, i+1)
253250
assert match, 'unexpected call to parse_starttag()'
254251
k = match.end()
255-
self.lasttag = tag = string.lower(rawdata[i+1:k])
252+
self.lasttag = tag = rawdata[i+1:k].lower()
256253

257254
while k < endpos:
258255
m = attrfind.match(rawdata, k)
@@ -265,21 +262,21 @@ def parse_starttag(self, i):
265262
attrvalue[:1] == '"' == attrvalue[-1:]:
266263
attrvalue = attrvalue[1:-1]
267264
attrvalue = self.unescape(attrvalue)
268-
attrs.append((string.lower(attrname), attrvalue))
265+
attrs.append((attrname.lower(), attrvalue))
269266
k = m.end()
270267

271-
end = string.strip(rawdata[k:endpos])
268+
end = rawdata[k:endpos].strip()
272269
if end not in (">", "/>"):
273270
lineno, offset = self.getpos()
274271
if "\n" in self.__starttag_text:
275-
lineno = lineno + string.count(self.__starttag_text, "\n")
272+
lineno = lineno + self.__starttag_text.count("\n")
276273
offset = len(self.__starttag_text) \
277-
- string.rfind(self.__starttag_text, "\n")
274+
- self.__starttag_text.rfind("\n")
278275
else:
279276
offset = offset + len(self.__starttag_text)
280277
self.error("junk characters in start tag: %s"
281278
% `rawdata[k:endpos][:20]`)
282-
if end[-2:] == '/>':
279+
if end.endswith('/>'):
283280
# XHTML-style empty tag: <span attr="value" />
284281
self.handle_startendtag(tag, attrs)
285282
else:
@@ -299,10 +296,9 @@ def check_for_whole_start_tag(self, i):
299296
if next == ">":
300297
return j + 1
301298
if next == "/":
302-
s = rawdata[j:j+2]
303-
if s == "/>":
299+
if rawdata.startswith("/>", j):
304300
return j + 2
305-
if s == "/":
301+
if rawdata.startswith("/", j):
306302
# buffer boundary
307303
return -1
308304
# else bogus input
@@ -332,7 +328,7 @@ def parse_endtag(self, i):
332328
if not match:
333329
self.error("bad end tag: %s" % `rawdata[i:j]`)
334330
tag = match.group(1)
335-
self.handle_endtag(string.lower(tag))
331+
self.handle_endtag(tag.lower())
336332
return j
337333

338334
# Overridable -- finish processing of start+end tag: <tag.../>
@@ -379,9 +375,9 @@ def unknown_decl(self, data):
379375
def unescape(self, s):
380376
if '&' not in s:
381377
return s
382-
s = string.replace(s, "&lt;", "<")
383-
s = string.replace(s, "&gt;", ">")
384-
s = string.replace(s, "&apos;", "'")
385-
s = string.replace(s, "&quot;", '"')
386-
s = string.replace(s, "&amp;", "&") # Must be last
378+
s = s.replace("&lt;", "<")
379+
s = s.replace("&gt;", ">")
380+
s = s.replace("&apos;", "'")
381+
s = s.replace("&quot;", '"')
382+
s = s.replace("&amp;", "&") # Must be last
387383
return s

0 commit comments

Comments
 (0)