diff --git a/Lib/_markupbase.py b/Lib/_markupbase.py index 3ad7e279960f7e..bb0e62fd60bdff 100644 --- a/Lib/_markupbase.py +++ b/Lib/_markupbase.py @@ -10,12 +10,12 @@ _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match _commentclose = re.compile(r'--\s*>') -_markedsectionclose = re.compile(r']\s*]\s*>') +_markedsectionclose = re.compile(r'](\s*]\s*>)') # An analysis of the MS-Word extensions is available at # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf -_msmarkedsectionclose = re.compile(r']\s*>') +_msmarkedsectionclose = re.compile(r'(]\s*>)') del re @@ -157,7 +157,7 @@ def parse_marked_section(self, i, report=1): if not match: return -1 if report: - j = match.start(0) + j = match.start(1) self.unknown_decl(rawdata[i+3: j]) return match.end(0) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 12917755a56017..54e212b4800dfa 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -315,6 +315,14 @@ def get_events(self): ("endtag", element_lower)], collector=Collector(convert_charrefs=False)) + def test_cdata_decl(self): + self._run_check('

x<y

', + [('starttag', 'math', []), + ('starttag', 'ms', []), + ('unknown decl', 'CDATA[x" '' diff --git a/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst new file mode 100644 index 00000000000000..ede8ce0b5377e4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-03-06-13-23-34.bpo-0.jzVmiO.rst @@ -0,0 +1 @@ +Fix html.parser dropping the closing square bracket when passing CDATA content to the unknown_decl method. \ No newline at end of file