From 6deda184a030906069d084485036710d2afe04b3 Mon Sep 17 00:00:00 2001
From: Michal Nazarewicz <mina86@mina86.com>
Date: Wed, 3 Mar 2021 09:45:45 +0100
Subject: [PATCH 1/2] =?UTF-8?q?html.parser:=20fix=20=E2=80=98<![CDATA[=20.?=
 =?UTF-8?q?..=20]]>=E2=80=99=20handling=20not=20capturing=20=E2=80=98]?=
 =?UTF-8?q?=E2=80=99?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per the documentation, the unknown_decl method is called with ‘the
entire contents of the declaration inside the <![...]> markup.’
However, this is not quite the case for ‘<![CDATA[...]]>’, where the
first of the two final closing square brackets should be included but
isn’t.  In other words, for such a declaration unknown_decl is called
with the string ‘CDATA[...’ (note the unmatched opening bracket).

Not including the closing bracket not only contradicts the
documentation but also makes it impossible to output the declaration
unchanged, since ‘"<![" + data + "]>"’ loses one of the closing
brackets at the end.

Fix by including the first of the closing brackets when calling
unknown_decl.
---
 Lib/_markupbase.py          | 6 +++---
 Lib/test/test_htmlparser.py | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/Lib/_markupbase.py b/Lib/_markupbase.py
index 3ad7e279960f7e..bb0e62fd60bdff 100644
--- a/Lib/_markupbase.py
+++ b/Lib/_markupbase.py
@@ -10,12 +10,12 @@
 _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
 _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
 _commentclose = re.compile(r'--\s*>')
-_markedsectionclose = re.compile(r']\s*]\s*>')
+_markedsectionclose = re.compile(r'](\s*]\s*>)')
 
 # An analysis of the MS-Word extensions is available at
 # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
 
-_msmarkedsectionclose = re.compile(r']\s*>')
+_msmarkedsectionclose = re.compile(r'(]\s*>)')
 
 del re
 
@@ -157,7 +157,7 @@ def parse_marked_section(self, i, report=1):
         if not match:
             return -1
         if report:
-            j = match.start(0)
+            j = match.start(1)
             self.unknown_decl(rawdata[i+3: j])
         return match.end(0)
 
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 12917755a56017..54e212b4800dfa 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -315,6 +315,14 @@ def get_events(self):
                                 ("endtag", element_lower)],
                             collector=Collector(convert_charrefs=False))
 
+    def test_cdata_decl(self):
+        self._run_check('<math><ms><![CDATA[x<y]]></ms></math>',
+                        [('starttag', 'math', []),
+                         ('starttag', 'ms', []),
+                         ('unknown decl', 'CDATA[x<y]'),
+                         ('endtag', 'ms'),
+                         ('endtag', 'math')])
+
     def test_comments(self):
         html = ("<!-- I'm a valid comment -->"
                 '<!--me too!-->'

From 2df83c3c2d325b0f3babf16efe5459e9fd140652 Mon Sep 17 00:00:00 2001
From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com>
Date: Sat, 6 Mar 2021 13:23:35 +0000
Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?=
 =?UTF-8?q?rb=5Fit.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst

diff --git a/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst
new file mode 100644
index 00000000000000..ede8ce0b5377e4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst
@@ -0,0 +1 @@
+Fix html.parser dropping the first of the two closing square brackets of a CDATA section when passing its content to the unknown_decl method.
\ No newline at end of file