Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 15cb489

Browse files
committed
#13358: HTMLParser now calls handle_data only once for each CDATA.
1 parent: 8008f2a; commit: 15cb489

3 files changed

Lines changed: 27 additions & 4 deletions

File tree

Lib/html/parser.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
# Regular expressions used for parsing
1515

1616
interesting_normal = re.compile('[&<]')
17-
interesting_cdata = re.compile(r'<(/|\Z)')
1817
incomplete = re.compile('&[a-zA-Z#]')
1918

2019
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -149,8 +148,8 @@ def get_starttag_text(self):
149148
return self.__starttag_text
150149

151150
def set_cdata_mode(self, elem):
152-
self.interesting = interesting_cdata
153151
self.cdata_elem = elem.lower()
152+
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
154153

155154
def clear_cdata_mode(self):
156155
self.interesting = interesting_normal
@@ -168,6 +167,8 @@ def goahead(self, end):
168167
if match:
169168
j = match.start()
170169
else:
170+
if self.cdata_elem:
171+
break
171172
j = n
172173
if i < j: self.handle_data(rawdata[i:j])
173174
i = self.updatepos(i, j)
@@ -250,7 +251,7 @@ def goahead(self, end):
250251
else:
251252
assert 0, "interesting.search() lied"
252253
# end while
253-
if end and i < n:
254+
if end and i < n and not self.cdata_elem:
254255
self.handle_data(rawdata[i:n])
255256
i = self.updatepos(i, n)
256257
self.rawdata = rawdata[i:]

Lib/test/test_htmlparser.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,27 @@ def test_cdata_content(self):
301301
("data", content),
302302
("endtag", element_lower)])
303303

304-
304+
def test_cdata_with_closing_tags(self):
305+
# see issue #13358
306+
# make sure that HTMLParser calls handle_data only once for each CDATA.
307+
# The normal event collector normalizes the events in get_events,
308+
# so we override it to return the original list of events.
309+
class Collector(EventCollector):
310+
def get_events(self):
311+
return self.events
312+
313+
content = """<!-- not a comment --> &not-an-entity-ref;
314+
<a href="" /> </p><p> <span></span></style>
315+
'</script' + '>'"""
316+
for element in [' script', 'script ', ' script ',
317+
'\nscript', 'script\n', '\nscript\n']:
318+
element_lower = element.lower().strip()
319+
s = '<script>{content}</{element}>'.format(element=element,
320+
content=content)
321+
self._run_check(s, [("starttag", element_lower, []),
322+
("data", content),
323+
("endtag", element_lower)],
324+
collector=Collector())
305325

306326
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
307327

Misc/NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ Core and Builtins
7676
Library
7777
-------
7878

79+
- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
80+
7981
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
8082
node when it is the only child of an element. Initial patch by Dan
8183
Kenigsberg.

0 commit comments

Comments
 (0)