Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 029acfb

Browse files
committed
Deal more appropriately with bare ampersands and pointy brackets; this
module has to deal with "classic" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows. This closes SF bug #453059, but uses a different fix than suggested in the bug comments.
1 parent 18da1e1 commit 029acfb

2 files changed

Lines changed: 39 additions & 19 deletions

File tree

Lib/HTMLParser.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515

1616
interesting_normal = re.compile('[&<]')
1717
interesting_cdata = re.compile(r'<(/|\Z)')
18-
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
18+
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
19+
'|#([0-9]*|[xX][0-9a-fA-F]*))?')
1920

2021
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2122
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
@@ -185,11 +186,8 @@ def goahead(self, end):
185186
elif declopen.match(rawdata, i): # <!
186187
k = self.parse_declaration(i)
187188
else:
188-
if i < n-1:
189-
raise HTMLParseError(
190-
"invalid '<' construct: %s" % `rawdata[i:i+2]`,
191-
self.getpos())
192-
k = -1
189+
self.handle_data("<")
190+
k = i + 1
193191
if k < 0:
194192
if end:
195193
raise HTMLParseError("EOF in middle of construct",
@@ -203,7 +201,7 @@ def goahead(self, end):
203201
self.handle_charref(name)
204202
k = match.end()
205203
if rawdata[k-1] != ';':
206-
k = k-1
204+
k = k - 1
207205
i = self.updatepos(i, k)
208206
continue
209207
match = entityref.match(rawdata, i)
@@ -212,17 +210,19 @@ def goahead(self, end):
212210
self.handle_entityref(name)
213211
k = match.end()
214212
if rawdata[k-1] != ';':
215-
k = k-1
213+
k = k - 1
216214
i = self.updatepos(i, k)
217215
continue
218-
if incomplete.match(rawdata, i):
219-
if end:
216+
match = incomplete.match(rawdata, i)
217+
if match:
218+
rest = rawdata[i:]
219+
if end and rest != "&" and match.group() == rest:
220220
raise HTMLParseError(
221221
"EOF in middle of entity or char ref",
222222
self.getpos())
223223
return -1 # incomplete
224-
raise HTMLParseError("'&' not part of entity or char ref",
225-
self.getpos())
224+
self.handle_data("&")
225+
i = self.updatepos(i, i + 1)
226226
else:
227227
assert 0, "interesting.search() lied"
228228
# end while

Lib/test/test_htmlparser.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Tests for HTMLParser.py."""
22

33
import HTMLParser
4+
import pprint
45
import sys
56
import test_support
67
import unittest
@@ -83,9 +84,10 @@ def _run_check(self, source, events, collector=EventCollector):
8384
for c in self.epilogue:
8485
parser.feed(c)
8586
parser.close()
86-
self.assert_(parser.get_events() ==
87-
self.initial_events + events + self.final_events,
88-
parser.get_events())
87+
events = parser.get_events()
88+
self.assertEqual(events,
89+
self.initial_events + events + self.final_events,
90+
"got events:\n" + pprint.pformat(events))
8991

9092
def _run_check_extra(self, source, events):
9193
self._run_check(source, events, EventCollectorExtra)
@@ -137,6 +139,18 @@ def test_simple_html(self):
137139
("data", "\n"),
138140
])
139141

142+
def test_doctype_decl(self):
143+
inside = """\
144+
DOCTYPE html [
145+
<!ELEMENT html - O EMPTY>
146+
<!ATTLIST html
147+
version CDATA #IMPLIED '4.0'>
148+
<!-- comment -->
149+
]"""
150+
self._run_check("<!%s>" % inside, [
151+
("decl", inside),
152+
])
153+
140154
def test_bad_nesting(self):
141155
# Strangely, this *is* supposed to test that overlapping
142156
# elements are allowed. HTMLParser is more geared toward
@@ -148,6 +162,16 @@ def test_bad_nesting(self):
148162
("endtag", "b"),
149163
])
150164

165+
def test_bare_ampersands(self):
166+
self._run_check("this text & contains & ampersands &", [
167+
("data", "this text & contains & ampersands &"),
168+
])
169+
170+
def test_bare_pointy_brackets(self):
171+
self._run_check("this < text > contains < bare>pointy< brackets", [
172+
("data", "this < text > contains < bare>pointy< brackets"),
173+
])
174+
151175
def test_attr_syntax(self):
152176
output = [
153177
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
@@ -199,16 +223,12 @@ def test_buffer_artefacts(self):
199223
self._run_check(["<a b='>'", ">"], output)
200224

201225
def test_starttag_junk_chars(self):
202-
self._parse_error("<")
203-
self._parse_error("<>")
204226
self._parse_error("</>")
205227
self._parse_error("</$>")
206228
self._parse_error("</")
207229
self._parse_error("</a")
208230
self._parse_error("<a<a>")
209231
self._parse_error("</a<a>")
210-
self._parse_error("<$")
211-
self._parse_error("<$>")
212232
self._parse_error("<!")
213233
self._parse_error("<a $>")
214234
self._parse_error("<a")

0 commit comments

Comments
 (0)