Thanks to visit codestin.com
Credit goes to github.com

Skip to content

gh-118350: Add escapable-raw-text to CDATA mode #135310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions Lib/html/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase):
containing respectively the named or numeric reference as the
argument.
"""

CDATA_CONTENT_ELEMENTS = ("script", "style")
# For escapable raw text elements (textarea and title), CDATA mode is reused
CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title")

def __init__(self, *, convert_charrefs=True):
"""Initialize and reset this instance.
Expand All @@ -117,6 +117,7 @@ def reset(self):
self.lasttag = '???'
self.interesting = interesting_normal
self.cdata_elem = None
self._raw_escapable = False
super().reset()

def feed(self, data):
Expand All @@ -140,11 +141,16 @@ def get_starttag_text(self):

def set_cdata_mode(self, elem):
    """Switch the parser into CDATA mode for the element *elem*.

    The lower-cased tag name is stored in ``self.cdata_elem``.  For the
    escapable raw text elements (``textarea`` and ``title``)
    ``self._raw_escapable`` is set and ``self.interesting`` matches only
    ``&``; for any other element ``self.interesting`` matches that
    element's closing tag, ignoring case.
    """
    self.cdata_elem = elem.lower()
    # Assign the flag on every call so that an earlier "textarea"/"title"
    # call cannot leave it set for a later raw text element.
    self._raw_escapable = self.cdata_elem in ("textarea", "title")
    if self._raw_escapable:
        # NOTE(review): this pattern cannot match a closing tag -- confirm
        # that goahead() still finds </textarea> / </title> when
        # convert_charrefs is false.
        self.interesting = re.compile('[&]')
    else:
        # Compile the closing-tag pattern only on the branch that uses it.
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

def clear_cdata_mode(self):
    """Leave CDATA mode: forget the element and restore the default pattern."""
    # Reset the per-element state together, then the scanning pattern.
    self.cdata_elem, self._raw_escapable = None, False
    self.interesting = interesting_normal

# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
Expand All @@ -154,7 +160,7 @@ def goahead(self, end):
i = 0
n = len(rawdata)
while i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
j = rawdata.find('<', i)
if j < 0:
# if we can't find the next <, either we are at the end
Expand All @@ -177,7 +183,7 @@ def goahead(self, end):
break
j = n
if i < j:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:j]))
else:
self.handle_data(rawdata[i:j])
Expand Down Expand Up @@ -210,7 +216,7 @@ def goahead(self, end):
k = i + 1
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:k]))
else:
self.handle_data(rawdata[i:k])
Expand Down Expand Up @@ -261,7 +267,7 @@ def goahead(self, end):
assert 0, "interesting.search() lied"
# end while
if end and i < n:
if self.convert_charrefs and not self.cdata_elem:
if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable):
self.handle_data(unescape(rawdata[i:n]))
else:
self.handle_data(rawdata[i:n])
Expand Down
59 changes: 59 additions & 0 deletions Lib/test/test_htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,65 @@ def test_cdata_content(self):
("data", content),
("endtag", element_lower)])

def test_raw_text_content(self):
    """Check that markup inside (escapable) raw text elements is data."""
    # Tags should be treated as text in raw text and escapable raw text
    # content, whatever the letter case of the enclosing element.
    content = "<h1>tagshould be handled as text"
    elements = (
        "script", "style", "title", "textarea",
        "SCRIPT", "STYLE", "TITLE", "TEXTAREA",
        "Script", "Style", "Title", "Textarea",
    )
    for element in elements:
        # The start tag is reported lower-cased, followed by the
        # unparsed content as a single data event.
        expected_events = [
            ("starttag", element.lower(), []),
            ("data", content),
        ]
        self._run_check(f"<{element}>{content}", expected_events)

def test_escapable_raw_text_content(self):
    """Check charref handling inside <title> and <textarea> content."""
    # Charrefs should be unescaped in escapable raw text content when
    # convert_charrefs is true, and reported as separate events otherwise.
    content = "Timon &amp; Pumba"
    expected = "Timon & Pumba"
    elements = [
        "title",
        "textarea",
        "TITLE",
        "TEXTAREA",
        "Title",
        "Textarea",
    ]
    for element in elements:
        source = f"<{element}>{content}"
        starttag = ("starttag", element.lower(), [])
        # EventCollector is used directly; an empty subclass adds nothing.
        self._run_check(
            source,
            [
                starttag,
                ('data', expected),
            ],
            collector=EventCollector(convert_charrefs=True),
        )
        # test with convert_charrefs=False
        # NOTE(review): no collector is passed here; presumably
        # _run_check's default collector uses convert_charrefs=False --
        # confirm against its definition.
        self._run_check(
            source,
            [
                starttag,
                ('data', 'Timon '),
                ('entityref', 'amp'),
                ('data', ' Pumba'),
            ],
        )


def test_cdata_with_closing_tags(self):
# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
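Fix :class:`html.parser.HTMLParser` handling of the escapable raw text elements ``title`` and ``textarea``: their content is now processed in CDATA mode, so tags inside it are reported as data while character references are still recognized.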
Loading