# A single '>' character.
piclose = re.compile('>')
# '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# An ASCII letter followed by any run of letters, digits, '-', '.', ':' or '_'.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# Note: the strict member of this pair is not truly strict, but it cannot
# be made correctly strict without breaking backward compatibility.
#
# Both patterns capture: group 1 = attribute name, group 2 = the optional
# "= value" part (including the '='), group 3 = the value itself (quotes,
# if any, are still attached).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
# Tolerant variant: identical, except that an unquoted value may be any run
# of characters other than '>' and whitespace.
attrfind_tolerant = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
3135locatestarttagend = re .compile (r"""
3236 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
3337 (?:\s+ # whitespace before attribute name
4246 )*
4347 \s* # trailing whitespace
4448""" , re .VERBOSE )
# Tolerant counterpart of locatestarttagend: whitespace before an attribute
# name is optional, a bare value may contain anything except quotes, '>' and
# whitespace, and a value may be followed by stray commas.  Every part after
# the tag name is optional, so the pattern matches from '<name' up to (but
# not including) whatever character ends the recognised attribute run.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s*                             # optional whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# A single '>' character.
endendtag = re.compile('>')
# A complete end tag: '</', optional whitespace, the tag name (group 1),
# optional whitespace, '>'.  Written as a raw string so that '\s' reaches the
# regex engine as-is instead of being an invalid string escape.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
4766
@@ -86,9 +105,15 @@ class HTMLParser(_markupbase.ParserBase):
86105
87106 CDATA_CONTENT_ELEMENTS = ("script" , "style" )
88107
def __init__(self, strict=True):
    """Initialize and reset this instance.

    If strict is set to True (the default), errors are raised when invalid
    HTML is encountered.  If set to False, an attempt is instead made to
    continue parsing, making "best guesses" about the intended meaning, in
    a fashion similar to what browsers typically do.
    """
    # Store the mode before reset() so it is already available while the
    # instance state is being (re)initialised.
    self.strict = strict
    self.reset()
93118
94119 def reset (self ):
@@ -160,9 +185,18 @@ def goahead(self, end):
160185 else :
161186 break
162187 if k < 0 :
163- if end :
188+ if not end :
189+ break
190+ if self .strict :
164191 self .error ("EOF in middle of construct" )
165- break
192+ k = rawdata .find ('>' , i + 1 )
193+ if k < 0 :
194+ k = rawdata .find ('<' , i + 1 )
195+ if k < 0 :
196+ k = i + 1
197+ else :
198+ k += 1
199+ self .handle_data (rawdata [i :k ])
166200 i = self .updatepos (i , k )
167201 elif startswith ("&#" , i ):
168202 match = charref .match (rawdata , i )
@@ -193,7 +227,12 @@ def goahead(self, end):
193227 if match :
194228 # match.group() will contain at least 2 chars
195229 if end and match .group () == rawdata [i :]:
196- self .error ("EOF in middle of entity or char ref" )
230+ if self .strict :
231+ self .error ("EOF in middle of entity or char ref" )
232+ else :
233+ if k <= i :
234+ k = n
235+ i = self .updatepos (i , i + 1 )
197236 # incomplete
198237 break
199238 elif (i + 1 ) < n :
@@ -240,7 +279,10 @@ def parse_starttag(self, i):
240279 self .lasttag = tag = rawdata [i + 1 :k ].lower ()
241280
242281 while k < endpos :
243- m = attrfind .match (rawdata , k )
282+ if self .strict :
283+ m = attrfind .match (rawdata , k )
284+ else :
285+ m = attrfind_tolerant .search (rawdata , k )
244286 if not m :
245287 break
246288 attrname , rest , attrvalue = m .group (1 , 2 , 3 )
@@ -262,8 +304,11 @@ def parse_starttag(self, i):
262304 - self .__starttag_text .rfind ("\n " )
263305 else :
264306 offset = offset + len (self .__starttag_text )
265- self .error ("junk characters in start tag: %r"
266- % (rawdata [k :endpos ][:20 ],))
307+ if self .strict :
308+ self .error ("junk characters in start tag: %r"
309+ % (rawdata [k :endpos ][:20 ],))
310+ self .handle_data (rawdata [i :endpos ])
311+ return endpos
267312 if end .endswith ('/>' ):
268313 # XHTML-style empty tag: <span attr="value" />
269314 self .handle_startendtag (tag , attrs )
@@ -277,7 +322,10 @@ def parse_starttag(self, i):
277322 # or -1 if incomplete.
278323 def check_for_whole_start_tag (self , i ):
279324 rawdata = self .rawdata
280- m = locatestarttagend .match (rawdata , i )
325+ if self .strict :
326+ m = locatestarttagend .match (rawdata , i )
327+ else :
328+ m = locatestarttagend_tolerant .match (rawdata , i )
281329 if m :
282330 j = m .end ()
283331 next = rawdata [j :j + 1 ]
@@ -290,8 +338,13 @@ def check_for_whole_start_tag(self, i):
290338 # buffer boundary
291339 return - 1
292340 # else bogus input
293- self .updatepos (i , j + 1 )
294- self .error ("malformed empty start tag" )
341+ if self .strict :
342+ self .updatepos (i , j + 1 )
343+ self .error ("malformed empty start tag" )
344+ if j > i :
345+ return j
346+ else :
347+ return i + 1
295348 if next == "" :
296349 # end of input
297350 return - 1
@@ -300,8 +353,13 @@ def check_for_whole_start_tag(self, i):
300353 # end of input in or before attribute value, or we have the
301354 # '/' from a '/>' ending
302355 return - 1
303- self .updatepos (i , j )
304- self .error ("malformed start tag" )
356+ if self .strict :
357+ self .updatepos (i , j )
358+ self .error ("malformed start tag" )
359+ if j > i :
360+ return j
361+ else :
362+ return i + 1
305363 raise AssertionError ("we should not get here!" )
306364
307365 # Internal -- parse endtag, return end or -1 if incomplete
@@ -314,7 +372,15 @@ def parse_endtag(self, i):
314372 j = match .end ()
315373 match = endtagfind .match (rawdata , i ) # </ + tag + >
316374 if not match :
317- self .error ("bad end tag: %r" % (rawdata [i :j ],))
375+ if self .strict :
376+ self .error ("bad end tag: %r" % (rawdata [i :j ],))
377+ k = rawdata .find ('<' , i + 1 , j )
378+ if k > i :
379+ j = k
380+ if j <= i :
381+ j = i + 1
382+ self .handle_data (rawdata [i :j ])
383+ return j
318384 tag = match .group (1 )
319385 self .handle_endtag (tag .lower ())
320386 self .clear_cdata_mode ()
@@ -358,7 +424,8 @@ def handle_pi(self, data):
358424 pass
359425
def unknown_decl(self, data):
    """Handle a declaration that was not recognised.

    In strict mode the problem is reported through self.error(), with the
    declaration text included in the message.  When self.strict is false
    the declaration is silently ignored.
    """
    if self.strict:
        self.error("unknown declaration: %r" % (data,))
362429
363430 # Internal -- helper to remove special character quoting
364431 entitydefs = None
0 commit comments