2929piclose = re .compile ('>' )
3030commentclose = re .compile (r'--\s*>' )
3131# Note:
32- # 1) the strict attrfind isn't really strict, but we can't make it
33- # correctly strict without breaking backward compatibility;
34- # 2) if you change tagfind/attrfind remember to update locatestarttagend too;
35- # 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
32+ # 1) if you change tagfind_tolerant/attrfind_tolerant remember to update locatestarttagend_tolerant too;
33+ # 2) if you change tagfind_tolerant/attrfind_tolerant and/or locatestarttagend_tolerant the parser will
3634# explode, so don't do it.
37- tagfind = re .compile ('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*' )
3835# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
3936# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
4037tagfind_tolerant = re .compile ('([a-zA-Z][^\t \n \r \f />\x00 ]*)(?:\s|/(?!>))*' )
41- attrfind = re .compile (
42- r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
43- r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?' )
4438attrfind_tolerant = re .compile (
4539 r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
4640 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*' )
47- locatestarttagend = re .compile (r"""
48- <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
49- (?:\s+ # whitespace before attribute name
50- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
51- (?:\s*=\s* # value indicator
52- (?:'[^']*' # LITA-enclosed value
53- |\"[^\"]*\" # LIT-enclosed value
54- |[^'\">\s]+ # bare value
55- )
56- )?
57- )
58- )*
59- \s* # trailing whitespace
60- """ , re .VERBOSE )
6141locatestarttagend_tolerant = re .compile (r"""
6242 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
6343 (?:[\s/]* # optional whitespace before attribute name
7959endtagfind = re .compile ('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>' )
8060
8161
82- class HTMLParseError (Exception ):
83- """Exception raised for all parse errors."""
84-
85- def __init__ (self , msg , position = (None , None )):
86- assert msg
87- self .msg = msg
88- self .lineno = position [0 ]
89- self .offset = position [1 ]
90-
91- def __str__ (self ):
92- result = self .msg
93- if self .lineno is not None :
94- result = result + ", at line %d" % self .lineno
95- if self .offset is not None :
96- result = result + ", column %d" % (self .offset + 1 )
97- return result
98-
99-
10062_default_sentinel = object ()
10163
10264class HTMLParser (_markupbase .ParserBase ):
@@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):
12385
12486 CDATA_CONTENT_ELEMENTS = ("script" , "style" )
12587
126- def __init__ (self , strict = _default_sentinel , * ,
127- convert_charrefs = _default_sentinel ):
88+ def __init__ (self , * , convert_charrefs = _default_sentinel ):
12889 """Initialize and reset this instance.
12990
13091 If convert_charrefs is True (default: False), all character references
13192 are automatically converted to the corresponding Unicode characters.
132- If strict is set to False (the default) the parser will parse invalid
133- markup, otherwise it will raise an error. Note that the strict mode
134- and argument are deprecated.
13593 """
136- if strict is not _default_sentinel :
137- warnings .warn ("The strict argument and mode are deprecated." ,
138- DeprecationWarning , stacklevel = 2 )
139- else :
140- strict = False # default
141- self .strict = strict
14294 if convert_charrefs is _default_sentinel :
14395 convert_charrefs = False # default
14496 warnings .warn ("The value of convert_charrefs will become True in "
@@ -168,11 +120,6 @@ def close(self):
168120 """Handle any buffered data."""
169121 self .goahead (1 )
170122
171- def error (self , message ):
172- warnings .warn ("The 'error' method is deprecated." ,
173- DeprecationWarning , stacklevel = 2 )
174- raise HTMLParseError (message , self .getpos ())
175-
176123 __starttag_text = None
177124
178125 def get_starttag_text (self ):
@@ -227,10 +174,7 @@ def goahead(self, end):
227174 elif startswith ("<?" , i ):
228175 k = self .parse_pi (i )
229176 elif startswith ("<!" , i ):
230- if self .strict :
231- k = self .parse_declaration (i )
232- else :
233- k = self .parse_html_declaration (i )
177+ k = self .parse_html_declaration (i )
234178 elif (i + 1 ) < n :
235179 self .handle_data ("<" )
236180 k = i + 1
@@ -239,8 +183,6 @@ def goahead(self, end):
239183 if k < 0 :
240184 if not end :
241185 break
242- if self .strict :
243- self .error ("EOF in middle of construct" )
244186 k = rawdata .find ('>' , i + 1 )
245187 if k < 0 :
246188 k = rawdata .find ('<' , i + 1 )
@@ -282,13 +224,10 @@ def goahead(self, end):
282224 if match :
283225 # match.group() will contain at least 2 chars
284226 if end and match .group () == rawdata [i :]:
285- if self .strict :
286- self .error ("EOF in middle of entity or char ref" )
287- else :
288- k = match .end ()
289- if k <= i :
290- k = n
291- i = self .updatepos (i , i + 1 )
227+ k = match .end ()
228+ if k <= i :
229+ k = n
230+ i = self .updatepos (i , i + 1 )
292231 # incomplete
293232 break
294233 elif (i + 1 ) < n :
@@ -367,18 +306,12 @@ def parse_starttag(self, i):
367306
368307 # Now parse the data between i+1 and j into a tag and attrs
369308 attrs = []
370- if self .strict :
371- match = tagfind .match (rawdata , i + 1 )
372- else :
373- match = tagfind_tolerant .match (rawdata , i + 1 )
309+ match = tagfind_tolerant .match (rawdata , i + 1 )
374310 assert match , 'unexpected call to parse_starttag()'
375311 k = match .end ()
376312 self .lasttag = tag = match .group (1 ).lower ()
377313 while k < endpos :
378- if self .strict :
379- m = attrfind .match (rawdata , k )
380- else :
381- m = attrfind_tolerant .match (rawdata , k )
314+ m = attrfind_tolerant .match (rawdata , k )
382315 if not m :
383316 break
384317 attrname , rest , attrvalue = m .group (1 , 2 , 3 )
@@ -401,9 +334,6 @@ def parse_starttag(self, i):
401334 - self .__starttag_text .rfind ("\n " )
402335 else :
403336 offset = offset + len (self .__starttag_text )
404- if self .strict :
405- self .error ("junk characters in start tag: %r"
406- % (rawdata [k :endpos ][:20 ],))
407337 self .handle_data (rawdata [i :endpos ])
408338 return endpos
409339 if end .endswith ('/>' ):
@@ -419,10 +349,7 @@ def parse_starttag(self, i):
419349 # or -1 if incomplete.
420350 def check_for_whole_start_tag (self , i ):
421351 rawdata = self .rawdata
422- if self .strict :
423- m = locatestarttagend .match (rawdata , i )
424- else :
425- m = locatestarttagend_tolerant .match (rawdata , i )
352+ m = locatestarttagend_tolerant .match (rawdata , i )
426353 if m :
427354 j = m .end ()
428355 next = rawdata [j :j + 1 ]
@@ -435,9 +362,6 @@ def check_for_whole_start_tag(self, i):
435362 # buffer boundary
436363 return - 1
437364 # else bogus input
438- if self .strict :
439- self .updatepos (i , j + 1 )
440- self .error ("malformed empty start tag" )
441365 if j > i :
442366 return j
443367 else :
@@ -450,9 +374,6 @@ def check_for_whole_start_tag(self, i):
450374 # end of input in or before attribute value, or we have the
451375 # '/' from a '/>' ending
452376 return - 1
453- if self .strict :
454- self .updatepos (i , j )
455- self .error ("malformed start tag" )
456377 if j > i :
457378 return j
458379 else :
@@ -472,8 +393,6 @@ def parse_endtag(self, i):
472393 if self .cdata_elem is not None :
473394 self .handle_data (rawdata [i :gtpos ])
474395 return gtpos
475- if self .strict :
476- self .error ("bad end tag: %r" % (rawdata [i :gtpos ],))
477396 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
478397 namematch = tagfind_tolerant .match (rawdata , i + 2 )
479398 if not namematch :
@@ -539,8 +458,7 @@ def handle_pi(self, data):
539458 pass
540459
541460 def unknown_decl (self , data ):
542- if self .strict :
543- self .error ("unknown declaration: %r" % (data ,))
461+ pass
544462
545463 # Internal -- helper to remove special character quoting
546464 def unescape (self , s ):
0 commit comments