From 4fb000658ea11daf9b196c03c8d61908367e474e Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sat, 10 May 2025 16:31:43 +0200 Subject: [PATCH] gh-77057: Fix handling of invalid markup declarations in HTMLParser (GH-9295) (cherry picked from commit 76c0b01bc401c3e976011bbc69cec56dbebe0ad5) Co-authored-by: Ezio Melotti Co-authored-by: Serhiy Storchaka --- Lib/html/parser.py | 4 +- Lib/test/test_htmlparser.py | 81 +++++++++++++++---- ...5-05-09-15-50-00.gh-issue-77057.fV8SU-.rst | 2 + 3 files changed, 68 insertions(+), 19 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 0a1dd3b7d3bfd2..1b8b6ea0e5ab7a 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -278,7 +278,7 @@ def parse_html_declaration(self, i): if rawdata[i:i+4] == '' '' '' - '') + '' + # see #32876 + '' + '' + '' + '' + '' + '' # required '[' after CDATA + ) expected = [ ('comment', ' not really a comment '), ('comment', ' not a comment either --'), @@ -579,39 +600,65 @@ def test_broken_comments(self): ('comment', ''), ('comment', '<-- this was an empty comment'), ('comment', '!! another bogus comment !!!'), + ('comment', '[with square brackets]!'), + ('comment', '[\nmultiline\nbogusness\n]!'), + ('comment', '[more brackets]-[and a hyphen]!'), + ('comment', '[cdata[should be uppercase]]'), + ('comment', '[CDATA [whitespaces are not ignored]]'), + ('comment', '[CDATA]]'), ] self._run_check(html, expected) def test_broken_condcoms(self): # these condcoms are missing the '--' after '' + # and they are considered bogus comments according to + # "8.2.4.42. Markup declaration open state" html = ('broken condcom' '' '' 'foo' '') - # According to the HTML5 specs sections "8.2.4.44 Bogus comment state" - # and "8.2.4.45 Markup declaration open state", comment tokens should - # be emitted instead of 'unknown decl', but calling unknown_decl - # provides more flexibility. 
- # See also Lib/_markupbase.py:parse_declaration expected = [ - ('unknown decl', 'if !(IE)'), + ('comment', '[if !(IE)]'), ('data', 'broken condcom'), - ('unknown decl', 'endif'), - ('unknown decl', 'if ! IE'), + ('comment', '[endif]'), + ('comment', '[if ! IE]'), ('startendtag', 'link', [('href', 'favicon.tiff')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !IE 6'), + ('comment', '[endif]'), + ('comment', '[if !IE 6]'), ('startendtag', 'img', [('src', 'firefox.png')]), - ('unknown decl', 'endif'), - ('unknown decl', 'if !ie 6'), + ('comment', '[endif]'), + ('comment', '[if !ie 6]'), ('starttag', 'b', []), ('data', 'foo'), ('endtag', 'b'), - ('unknown decl', 'endif'), - ('unknown decl', 'if (!IE)|(lt IE 9)'), + ('comment', '[endif]'), + ('comment', '[if (!IE)|(lt IE 9)]'), ('startendtag', 'img', [('src', 'mammoth.bmp')]), - ('unknown decl', 'endif') + ('comment', '[endif]') + ] + self._run_check(html, expected) + + def test_cdata_declarations(self): + # More tests should be added. See also "8.2.4.42. Markup + # declaration open state", "8.2.4.69. CDATA section state", + # and issue 32876 + html = ('<![CDATA[just some plain text]]>') + expected = [('unknown decl', 'CDATA[just some plain text')] + self._run_check(html, expected) + + def test_cdata_declarations_multiline(self): + html = ('<code><![CDATA[' + ' if (a < b && a > b) {' + ' printf("[<marquee>How?</marquee>]");' + ' }' + ']]></code>') + expected = [ + ('starttag', 'code', []), + ('unknown decl', + 'CDATA[ if (a < b && a > b) { ' + 'printf("[<marquee>How?</marquee>]"); }'), + ('endtag', 'code') ] self._run_check(html, expected) diff --git a/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst new file mode 100644 index 00000000000000..42107de75c7d29 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-05-09-15-50-00.gh-issue-77057.fV8SU-.rst @@ -0,0 +1,2 @@ +Fix handling of invalid markup declarations in +:class:`html.parser.HTMLParser`.