|
23 | 23 | piclose = re.compile('>') |
24 | 24 | commentclose = re.compile(r'--\s*>') |
25 | 25 | tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') |
| 26 | +# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state |
| 27 | +# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state |
| 28 | +tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') |
26 | 29 | # Note, the strict one of this pair isn't really strict, but we can't |
27 | 30 | # make it correctly strict without breaking backward compatibility. |
28 | 31 | attrfind = re.compile( |
@@ -270,7 +273,7 @@ def goahead(self, end): |
270 | 273 | # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state |
271 | 274 | def parse_bogus_comment(self, i, report=1): |
272 | 275 | rawdata = self.rawdata |
273 | | - if rawdata[i:i+2] != '<!': |
| 276 | + if rawdata[i:i+2] not in ('<!', '</'): |
274 | 277 | self.error('unexpected call to parse_comment()') |
275 | 278 | pos = rawdata.find('>', i+2) |
276 | 279 | if pos == -1: |
@@ -398,31 +401,40 @@ def parse_endtag(self, i): |
398 | 401 | match = endendtag.search(rawdata, i+1) # > |
399 | 402 | if not match: |
400 | 403 | return -1 |
401 | | - j = match.end() |
| 404 | + gtpos = match.end() |
402 | 405 | match = endtagfind.match(rawdata, i) # </ + tag + > |
403 | 406 | if not match: |
404 | 407 | if self.cdata_elem is not None: |
405 | | - self.handle_data(rawdata[i:j]) |
406 | | - return j |
| 408 | + self.handle_data(rawdata[i:gtpos]) |
| 409 | + return gtpos |
407 | 410 | if self.strict: |
408 | | - self.error("bad end tag: %r" % (rawdata[i:j],)) |
409 | | - k = rawdata.find('<', i + 1, j) |
410 | | - if k > i: |
411 | | - j = k |
412 | | - if j <= i: |
413 | | - j = i + 1 |
414 | | - self.handle_data(rawdata[i:j]) |
415 | | - return j |
| 411 | + self.error("bad end tag: %r" % (rawdata[i:gtpos],)) |
| 412 | + # find the name: w3.org/TR/html5/tokenization.html#tag-name-state |
| 413 | + namematch = tagfind_tolerant.match(rawdata, i+2) |
| 414 | + if not namematch: |
| 415 | + # w3.org/TR/html5/tokenization.html#end-tag-open-state |
| 416 | + if rawdata[i:i+3] == '</>': |
| 417 | + return i+3 |
| 418 | + else: |
| 419 | + return self.parse_bogus_comment(i) |
| 420 | + tagname = namematch.group().lower() |
| 421 | + # consume and ignore other stuff between the name and the > |
| 422 | + # Note: this is not 100% correct, since we might have things like |
">, bu">
| 423 | +# </tag attr=">">, but looking for > after the name should cover |
| 424 | + # most of the cases and is much simpler |
| 425 | + gtpos = rawdata.find('>', namematch.end()) |
| 426 | + self.handle_endtag(tagname) |
| 427 | + return gtpos+1 |
416 | 428 |
|
417 | 429 | elem = match.group(1).lower() # script or style |
418 | 430 | if self.cdata_elem is not None: |
419 | 431 | if elem != self.cdata_elem: |
420 | | - self.handle_data(rawdata[i:j]) |
421 | | - return j |
| 432 | + self.handle_data(rawdata[i:gtpos]) |
| 433 | + return gtpos |
422 | 434 |
|
423 | 435 | self.handle_endtag(elem.lower()) |
424 | 436 | self.clear_cdata_mode() |
425 | | - return j |
| 437 | + return gtpos |
426 | 438 |
|
427 | 439 | # Overridable -- finish processing of start+end tag: <tag.../> |
428 | 440 | def handle_startendtag(self, tag, attrs): |
|
0 commit comments