Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit d5d4406

Browse files
committed
#1745761, #755670, #13357, #12629, #1200313: merge with 3.2.
2 parents 84b48a6 + c2fe577 commit d5d4406

3 files changed

Lines changed: 167 additions & 81 deletions

File tree

Lib/html/parser.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@
3030
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
3131
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
3232
attrfind_tolerant = re.compile(
33-
r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
34-
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
33+
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
34+
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
3535
locatestarttagend = re.compile(r"""
3636
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
3737
(?:\s+ # whitespace before attribute name
@@ -49,16 +49,16 @@
4949
locatestarttagend_tolerant = re.compile(r"""
5050
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
5151
(?:\s* # optional whitespace before attribute name
52-
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
53-
(?:\s*=\s* # value indicator
52+
(?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
53+
(?:\s*=+\s* # value indicator
5454
(?:'[^']*' # LITA-enclosed value
55-
|\"[^\"]*\" # LIT-enclosed value
56-
|[^'\">\s]+ # bare value
55+
|"[^"]*" # LIT-enclosed value
56+
|(?!['"])[^>\s]* # bare value
5757
)
5858
(?:\s*,)* # possibly followed by a comma
59-
)?
60-
)
61-
)*
59+
)?\s*
60+
)*
61+
)?
6262
\s* # trailing whitespace
6363
""", re.VERBOSE)
6464
endendtag = re.compile('>')
@@ -295,6 +295,7 @@ def parse_starttag(self, i):
295295
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
296296
attrvalue[:1] == '"' == attrvalue[-1:]:
297297
attrvalue = attrvalue[1:-1]
298+
if attrvalue:
298299
attrvalue = self.unescape(attrvalue)
299300
attrs.append((attrname.lower(), attrvalue))
300301
k = m.end()

Lib/test/test_htmlparser.py

Lines changed: 154 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -196,60 +196,6 @@ def test_bare_pointy_brackets(self):
196196
("data", "this < text > contains < bare>pointy< brackets"),
197197
])
198198

199-
def test_attr_syntax(self):
200-
output = [
201-
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
202-
]
203-
self._run_check("""<a b='v' c="v" d=v e>""", output)
204-
self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
205-
self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
206-
self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
207-
208-
def test_attr_values(self):
209-
self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
210-
[("starttag", "a", [("b", "xxx\n\txxx"),
211-
("c", "yyy\t\nyyy"),
212-
("d", "\txyz\n")])
213-
])
214-
self._run_check("""<a b='' c="">""", [
215-
("starttag", "a", [("b", ""), ("c", "")]),
216-
])
217-
# Regression test for SF patch #669683.
218-
self._run_check("<e a=rgb(1,2,3)>", [
219-
("starttag", "e", [("a", "rgb(1,2,3)")]),
220-
])
221-
# Regression test for SF bug #921657.
222-
self._run_check("<a href=mailto:xyz@example.com>", [
223-
("starttag", "a", [("href", "mailto:xyz@example.com")]),
224-
])
225-
226-
def test_attr_nonascii(self):
227-
# see issue 7311
228-
self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
229-
("starttag", "img", [("src", "/foo/bar.png"),
230-
("alt", "\u4e2d\u6587")]),
231-
])
232-
self._run_check("<a title='\u30c6\u30b9\u30c8' "
233-
"href='\u30c6\u30b9\u30c8.html'>", [
234-
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
235-
("href", "\u30c6\u30b9\u30c8.html")]),
236-
])
237-
self._run_check('<a title="\u30c6\u30b9\u30c8" '
238-
'href="\u30c6\u30b9\u30c8.html">', [
239-
("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
240-
("href", "\u30c6\u30b9\u30c8.html")]),
241-
])
242-
243-
def test_attr_entity_replacement(self):
244-
self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
245-
("starttag", "a", [("b", "&><\"'")]),
246-
])
247-
248-
def test_attr_funky_names(self):
249-
self._run_check("""<a a.b='v' c:d=v e-f=v>""", [
250-
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
251-
])
252-
253199
def test_illegal_declarations(self):
254200
self._parse_error('<!spacer type="block" height="25">')
255201

@@ -295,13 +241,11 @@ def test_starttag_junk_chars(self):
295241
self._parse_error("<a<a>")
296242
self._parse_error("</a<a>")
297243
self._parse_error("<!")
298-
self._parse_error("<a $>")
299244
self._parse_error("<a")
300245
self._parse_error("<a foo='bar'")
301246
self._parse_error("<a foo='bar")
302247
self._parse_error("<a foo='>'")
303248
self._parse_error("<a foo='>")
304-
self._parse_error("<a foo=>")
305249

306250
def test_declaration_junk_chars(self):
307251
self._parse_error("<!DOCTYPE foo $ >")
@@ -358,10 +302,6 @@ def test_cdata_content(self):
358302
("endtag", element_lower)])
359303

360304

361-
def test_entityrefs_in_attributes(self):
362-
self._run_check("<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
363-
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
364-
365305

366306
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
367307

@@ -371,15 +311,14 @@ def get_collector(self):
371311
def test_tolerant_parsing(self):
372312
self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
373313
'<img src="URL><//img></html</html>', [
374-
('data', '<html '),
375-
('starttag', 'html', []),
376-
('data', 'te>>xt'),
377-
('entityref', 'a'),
378-
('data', '<<bc'),
379-
('endtag', 'a'),
380-
('endtag', 'html'),
381-
('data', '\n<img src="URL><//img></html'),
382-
('endtag', 'html')])
314+
('starttag', 'html', [('<html', None)]),
315+
('data', 'te>>xt'),
316+
('entityref', 'a'),
317+
('data', '<<bc'),
318+
('endtag', 'a'),
319+
('endtag', 'html'),
320+
('data', '\n<img src="URL><//img></html'),
321+
('endtag', 'html')])
383322

384323
def test_with_unquoted_attributes(self):
385324
# see #12008
@@ -410,7 +349,7 @@ def test_comma_between_attributes(self):
410349
'method="post">', [
411350
('starttag', 'form',
412351
[('action', '/xxx.php?a=1&b=2&amp'),
413-
('method', 'post')])])
352+
(',', None), ('method', 'post')])])
414353

415354
def test_weird_chars_in_unquoted_attribute_values(self):
416355
self._run_check('<form action=bogus|&#()value>', [
@@ -441,7 +380,7 @@ def test_correct_detection_of_start_tags(self):
441380

442381
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
443382
expected = [
444-
('starttag', 'div', [('style', ''), ('foo', 'bar')]),
383+
('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
445384
('starttag', 'b', []),
446385
('data', 'The '),
447386
('starttag', 'a', [('href', 'some_url')]),
@@ -458,8 +397,151 @@ def test_unescape_function(self):
458397
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
459398

460399

400+
class AttributesStrictTestCase(TestCaseBase):
401+
402+
def get_collector(self):
403+
return EventCollector(strict=True)
404+
405+
def test_attr_syntax(self):
406+
output = [
407+
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
408+
]
409+
self._run_check("""<a b='v' c="v" d=v e>""", output)
410+
self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
411+
self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
412+
self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
413+
414+
def test_attr_values(self):
415+
self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
416+
[("starttag", "a", [("b", "xxx\n\txxx"),
417+
("c", "yyy\t\nyyy"),
418+
("d", "\txyz\n")])])
419+
self._run_check("""<a b='' c="">""",
420+
[("starttag", "a", [("b", ""), ("c", "")])])
421+
# Regression test for SF patch #669683.
422+
self._run_check("<e a=rgb(1,2,3)>",
423+
[("starttag", "e", [("a", "rgb(1,2,3)")])])
424+
# Regression test for SF bug #921657.
425+
self._run_check(
426+
"<a href=mailto:xyz@example.com>",
427+
[("starttag", "a", [("href", "mailto:xyz@example.com")])])
428+
429+
def test_attr_nonascii(self):
430+
# see issue 7311
431+
self._run_check(
432+
"<img src=/foo/bar.png alt=\u4e2d\u6587>",
433+
[("starttag", "img", [("src", "/foo/bar.png"),
434+
("alt", "\u4e2d\u6587")])])
435+
self._run_check(
436+
"<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
437+
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
438+
("href", "\u30c6\u30b9\u30c8.html")])])
439+
self._run_check(
440+
'<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
441+
[("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
442+
("href", "\u30c6\u30b9\u30c8.html")])])
443+
444+
def test_attr_entity_replacement(self):
445+
self._run_check(
446+
"<a b='&amp;&gt;&lt;&quot;&apos;'>",
447+
[("starttag", "a", [("b", "&><\"'")])])
448+
449+
def test_attr_funky_names(self):
450+
self._run_check(
451+
"<a a.b='v' c:d=v e-f=v>",
452+
[("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
453+
454+
def test_entityrefs_in_attributes(self):
455+
self._run_check(
456+
"<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
457+
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
458+
459+
460+
461+
class AttributesTolerantTestCase(AttributesStrictTestCase):
462+
463+
def get_collector(self):
464+
return EventCollector(strict=False)
465+
466+
def test_attr_funky_names2(self):
467+
self._run_check(
468+
"<a $><b $=%><c \=/>",
469+
[("starttag", "a", [("$", None)]),
470+
("starttag", "b", [("$", "%")]),
471+
("starttag", "c", [("\\", "/")])])
472+
473+
def test_entities_in_attribute_value(self):
474+
# see #1200313
475+
for entity in ['&', '&amp;', '&#38;', '&#x26;']:
476+
self._run_check('<a href="%s">' % entity,
477+
[("starttag", "a", [("href", "&")])])
478+
self._run_check("<a href='%s'>" % entity,
479+
[("starttag", "a", [("href", "&")])])
480+
self._run_check("<a href=%s>" % entity,
481+
[("starttag", "a", [("href", "&")])])
482+
483+
def test_malformed_attributes(self):
484+
# see #13357
485+
html = (
486+
"<a href=test'style='color:red;bad1'>test - bad1</a>"
487+
"<a href=test'+style='color:red;ba2'>test - bad2</a>"
488+
"<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
489+
"<a href = test'&nbsp;style='color:red;bad4' >test - bad4</a>"
490+
)
491+
expected = [
492+
('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
493+
('data', 'test - bad1'), ('endtag', 'a'),
494+
('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
495+
('data', 'test - bad2'), ('endtag', 'a'),
496+
('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
497+
('data', 'test - bad3'), ('endtag', 'a'),
498+
('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
499+
('data', 'test - bad4'), ('endtag', 'a')
500+
]
501+
self._run_check(html, expected)
502+
503+
def test_malformed_adjacent_attributes(self):
504+
# see #12629
505+
self._run_check('<x><y z=""o"" /></x>',
506+
[('starttag', 'x', []),
507+
('startendtag', 'y', [('z', ''), ('o""', None)]),
508+
('endtag', 'x')])
509+
self._run_check('<x><y z="""" /></x>',
510+
[('starttag', 'x', []),
511+
('startendtag', 'y', [('z', ''), ('""', None)]),
512+
('endtag', 'x')])
513+
514+
# see #755670 for the following 3 tests
515+
def test_adjacent_attributes(self):
516+
self._run_check('<a width="100%"cellspacing=0>',
517+
[("starttag", "a",
518+
[("width", "100%"), ("cellspacing","0")])])
519+
520+
self._run_check('<a id="foo"class="bar">',
521+
[("starttag", "a",
522+
[("id", "foo"), ("class","bar")])])
523+
524+
def test_missing_attribute_value(self):
525+
self._run_check('<a v=>',
526+
[("starttag", "a", [("v", "")])])
527+
528+
def test_javascript_attribute_value(self):
529+
self._run_check("<a href=javascript:popup('/popup/help.html')>",
530+
[("starttag", "a",
531+
[("href", "javascript:popup('/popup/help.html')")])])
532+
533+
def test_end_tag_in_attribute_value(self):
534+
# see #1745761
535+
self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
536+
[("starttag", "a",
537+
[("href", "http://www.example.org/\">;")]),
538+
("data", "spam"), ("endtag", "a")])
539+
540+
541+
461542
def test_main():
462-
support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase)
543+
support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase,
544+
AttributesStrictTestCase, AttributesTolerantTestCase)
463545

464546

465547
if __name__ == "__main__":

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,9 @@ Core and Builtins
365365
Library
366366
-------
367367

368+
- Issues #1745761, #755670, #13357, #12629, #1200313: HTMLParser now correctly
369+
handles non-valid attributes, including adjacent and unquoted attributes.
370+
368371
- Issue #13193: Fix distutils.filelist.FileList and
369372
packaging.manifest.Manifest under Windows. The "recursive-include"
370373
directive now recognizes both legal path separators.

0 commit comments

Comments
 (0)