From 6deda184a030906069d084485036710d2afe04b3 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 3 Mar 2021 09:45:45 +0100 Subject: [PATCH 1/2] =?UTF-8?q?html.parser:=20fix=20=E2=80=98<![CDATA[...]]>=E2=80=99=20handling=20not=20capturing=20=E2=80=98]?= =?UTF-8?q?=E2=80=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per documentation, the unknown_decl method is called with ‘the entire contents of the declaration inside the <![...]> markup.’ However, this is not quite the case for ‘<![CDATA[...]]>’ where the first of the two final closing square brackets should be included but isn’t. In other words, for such declaration unknown_decl is called with ‘CDATA[...’ string (observe unmatched brackets). Not including the closing bracket doesn’t fit the documentation but also makes it impossible to output the declaration without change since ‘"<![" + data + "]>"’ eats one of the brackets at the end. Fix by including the first of the closing brackets when calling unknown_decl. --- Lib/_markupbase.py | 6 +++--- Lib/test/test_htmlparser.py | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/Lib/_markupbase.py b/Lib/_markupbase.py index 3ad7e279960f7e..bb0e62fd60bdff 100644 --- a/Lib/_markupbase.py +++ b/Lib/_markupbase.py @@ -10,12 +10,12 @@ _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match _commentclose = re.compile(r'--\s*>') -_markedsectionclose = re.compile(r']\s*]\s*>') +_markedsectionclose = re.compile(r'](\s*]\s*>)') # An analysis of the MS-Word extensions is available at # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf -_msmarkedsectionclose = re.compile(r']\s*>') +_msmarkedsectionclose = re.compile(r'(]\s*>)') del re @@ -157,7 +157,7 @@ def parse_marked_section(self, i, report=1): if not match: return -1 if report: - j = match.start(0) + j = match.start(1) self.unknown_decl(rawdata[i+3: j]) return match.end(0) diff --git 
a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 12917755a56017..54e212b4800dfa 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -315,6 +315,14 @@ def get_events(self): ("endtag", element_lower)], collector=Collector(convert_charrefs=False)) + def test_cdata_decl(self): + self._run_check('<math><ms><![CDATA[x<y]]></ms></math>', + [('starttag', 'math', []), + ('starttag', 'ms', []), + ('unknown decl', 'CDATA[x<y]'), + ('endtag', 'ms'), + ('endtag', 'math')]) + def test_comments(self): html = ("<!-- I'm a valid comment -->" '<!--me too!-->' From 2df83c3c2d325b0f3babf16efe5459e9fd140652 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Sat, 6 Mar 2021 13:23:35 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst diff --git a/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst new file mode 100644 index 00000000000000..ede8ce0b5377e4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst @@ -0,0 +1 @@ +Fix html.parser dropping closing square bracket when passing CDATA content into unknown_decl method. \ No newline at end of file