diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 1e30956fe24f83..6a7a2d982aaba6 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -98,8 +98,8 @@ class HTMLParser(_markupbase.ParserBase): containing respectively the named or numeric reference as the argument. """ - - CDATA_CONTENT_ELEMENTS = ("script", "style") + # For escapable raw text elements (textarea and title), CDATA mode is reused + CDATA_CONTENT_ELEMENTS = ("script", "style", "textarea", "title") def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. @@ -117,6 +117,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._raw_escapable = False super().reset() def feed(self, data): @@ -140,11 +141,16 @@ def get_starttag_text(self): def set_cdata_mode(self, elem): self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) + if self.cdata_elem in ["textarea", "title"]: + self._raw_escapable = True + self.interesting = re.compile('[&]') + else: + self.interesting = re.compile(r'' % self.cdata_elem, re.I) def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + self._raw_escapable = False # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. 
If 'end' is @@ -154,7 +160,7 @@ def goahead(self, end): i = 0 n = len(rawdata) while i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): j = rawdata.find('<', i) if j < 0: # if we can't find the next <, either we are at the end @@ -177,7 +183,7 @@ def goahead(self, end): break j = n if i < j: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:j])) else: self.handle_data(rawdata[i:j]) @@ -210,7 +216,7 @@ def goahead(self, end): k = i + 1 else: k += 1 - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:k])) else: self.handle_data(rawdata[i:k]) @@ -261,7 +267,7 @@ def goahead(self, end): assert 0, "interesting.search() lied" # end while if end and i < n: - if self.convert_charrefs and not self.cdata_elem: + if self.convert_charrefs and (not self.cdata_elem or self._raw_escapable): self.handle_data(unescape(rawdata[i:n])) else: self.handle_data(rawdata[i:n]) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 61fa24fab574f2..9ae600c07b13cb 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -295,6 +295,65 @@ def test_cdata_content(self): ("data", content), ("endtag", element_lower)]) + def test_raw_text_content(self): + # Tags should be treated as text in raw text and escapable raw text content. + content = """

tagshould be handled as text""" + elements = [ + "script", + "style", + "title", + "textarea", + "SCRIPT", + "STYLE", + "TITLE", + "TEXTAREA", + "Script", + "Style", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check(source, [ + ("starttag", element.lower(), []), + ("data", content) + ]) + + def test_escapable_raw_text_content(self): + # Charrefs should be unescaped in escapable raw text content. + class Collector(EventCollector): + pass + + content = "Timon & Pumba" + expected = "Timon & Pumba" + elements = [ + "title", + "textarea", + "TITLE", + "TEXTAREA", + "Title", + "Textarea", + ] + for element in elements: + source = f"<{element}>{content}" + self._run_check( + source, [ + ("starttag", element.lower(), []), + ('data', expected), + ], + collector=Collector(convert_charrefs=True), + ) + # test with convert_charrefs=False + self._run_check( + source, [ + ("starttag", element.lower(), []), + ('data', 'Timon '), + ('entityref', 'amp'), + ('data', ' Pumba') + ], + ) + + def test_cdata_with_closing_tags(self): # see issue #13358 # make sure that HTMLParser calls handle_data only once for each CDATA. diff --git a/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst new file mode 100644 index 00000000000000..a9754421d5ac97 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-09-20-38-25.gh-issue-118350.KgWCcP.rst @@ -0,0 +1 @@ +Fix a bug in html parser related to escapable raw text mode.