Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 566ca68

Browse files
committed
Move RCData parser to where it should have been to begin with in order to fix last lxml unit test
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401151
1 parent abfca04 commit 566ca68

File tree

3 files changed

+47
-38
lines changed

3 files changed

+47
-38
lines changed

src/html5lib/html5parser.py

Lines changed: 38 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def __init__(self, strict = False, tree=simpletree.TreeBuilder,
7575
"afterBody": AfterBodyPhase(self, self.tree),
7676
"inFrameset": InFramesetPhase(self, self.tree),
7777
"afterFrameset": AfterFramesetPhase(self, self.tree),
78-
"trailingEnd": TrailingEndPhase(self, self.tree)
78+
"trailingEnd": TrailingEndPhase(self, self.tree),
7979
# XXX after after body
8080
# XXX after after frameset
8181
# XXX trailingEnd is gone
@@ -117,10 +117,11 @@ def _parse(self, stream, innerHTML=False, container="div",
117117
# relevant ... need others too
118118
self.lastPhase = None
119119

120+
self.beforeRCDataPhase = None
121+
120122
# XXX This is temporary for the moment so there isn't any other
121123
# changes needed for the parser to work with the iterable tokenizer
122-
for token in self.tokenizer:
123-
token = self.normalizeToken(token)
124+
for token in self.normalizedTokens():
124125
type = token["type"]
125126
method = getattr(self.phase, "process%s" % type, None)
126127
if type in ("Characters", "SpaceCharacters", "Comment"):
@@ -137,6 +138,10 @@ def _parse(self, stream, innerHTML=False, container="div",
137138
# When the loop finishes it's EOF
138139
self.phase.processEOF()
139140

141+
def normalizedTokens(self):
142+
for token in self.tokenizer:
143+
yield self.normalizeToken(token)
144+
140145
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
141146
"""Parse a HTML document into a well-formed tree
142147
@@ -238,6 +243,29 @@ def resetInsertionMode(self):
238243
self.phase = self.phases["inBody"]
239244
break
240245

246+
def parseRCDataCData(self, name, attributes, contentType):
247+
"""Generic (R)CDATA Parsing algorithm
248+
contentType - RCDATA or CDATA
249+
"""
250+
assert contentType in ("CDATA", "RCDATA")
251+
252+
element = self.tree.insertElement(name, attributes)
253+
self.tokenizer.contentModelFlag = contentModelFlags[contentType]
254+
255+
for token in self.normalizedTokens():
256+
if token["type"] in ("Characters", "SpaceCharacters"):
257+
self.tree.insertText(token["data"])
258+
elif token["type"] == "ParseError":
259+
self.parseError(token["data"], token.get("datavars", {}))
260+
else:
261+
assert self.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
262+
assert token["type"] == "EndTag" and token["name"] == name, repr(token)
263+
assert self.tree.openElements.pop() == element
264+
return
265+
#Otherwise we hit EOF
266+
assert self.tree.openElements.pop() == element
267+
self.parseError("expected-closing-tag-but-got-eof")
268+
241269
class Phase(object):
242270
"""Base class for helper object that implements each phase of processing
243271
"""
@@ -298,29 +326,6 @@ def startTagHtml(self, name, attributes):
298326
def processEndTag(self, name):
299327
self.endTagHandler[name](name)
300328

301-
def parseRCDataCData(self, name, attributes, contentType):
302-
"""Generic (R)CDATA Parsing algorithm
303-
contentType - RCDATA or CDATA
304-
"""
305-
assert contentType in ("CDATA", "RCDATA")
306-
element = self.tree.insertElement(name, attributes)
307-
self.parser.tokenizer.contentModelFlag = contentModelFlags[contentType]
308-
for token in self.parser.tokenizer:
309-
token = self.parser.normalizeToken(token)
310-
if token["type"] in ("Characters", "SpaceCharacters"):
311-
self.tree.insertText(token["data"])
312-
elif token["type"] == "ParseError":
313-
self.parser.parseError(token["data"], token.get("datavars", {}))
314-
else:
315-
assert self.parser.tokenizer.contentModelFlag == contentModelFlags["PCDATA"]
316-
assert token["type"] == "EndTag" and token["name"] == name, repr(token)
317-
assert self.tree.openElements.pop() == element
318-
return
319-
#Otherwise we hit EOF
320-
assert self.tree.openElements.pop() == element
321-
self.parser.parseError("expected-closing-tag-but-got-eof")
322-
323-
324329
class InitialPhase(Phase):
325330
# This phase deals with error handling as well which is currently not
326331
# covered in the specification. The error handling is typically known as
@@ -586,18 +591,18 @@ def startTagHead(self, name, attributes):
586591
self.parser.parseError("two-heads-are-not-better-than-one")
587592

588593
def startTagTitle(self, name, attributes):
589-
self.parseRCDataCData(name, attributes, "RCDATA")
594+
self.parser.parseRCDataCData(name, attributes, "RCDATA")
590595

591596
def startTagStyle(self, name, attributes):
592-
self.parseRCDataCData(name, attributes, "CDATA")
597+
self.parser.parseRCDataCData(name, attributes, "CDATA")
593598

594599
def startTagNoScript(self, name, attributes):
595600
#Need to decide whether to implement the scripting-disabled case
596-
self.parseRCDataCData(name, attributes, "CDATA")
601+
self.parser.parseRCDataCData(name, attributes, "CDATA")
597602

598603
def startTagScript(self, name, attributes):
599604
#I think this is equivalent to the CDATA stuff since we don't execute script
600-
self.parseRCDataCData(name, attributes, "CDATA")
605+
self.parser.parseRCDataCData(name, attributes, "CDATA")
601606

602607
def startTagBaseLinkMeta(self, name, attributes):
603608
if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]):
@@ -612,7 +617,7 @@ def startTagOther(self, name, attributes):
612617
self.parser.phase.processStartTag(name, attributes)
613618

614619
def endTagHead(self, name):
615-
assert self.tree.openElements[-1].name == "head"
620+
assert self.tree.openElements[-1].name == "head", "Expected head got %s"%self.tree.openElements[-1].name
616621
self.tree.openElements.pop()
617622
self.parser.phase = self.parser.phases["afterHead"]
618623

@@ -922,7 +927,7 @@ def startTagAppletMarqueeObject(self, name, attributes):
922927

923928
def startTagXmp(self, name, attributes):
924929
self.tree.reconstructActiveFormattingElements()
925-
self.parseRCDataCData(name, attributes, "CDATA")
930+
self.parser.parseRCDataCData(name, attributes, "CDATA")
926931

927932
def startTagTable(self, name, attributes):
928933
if self.tree.elementInScope("p"):
@@ -982,7 +987,7 @@ def startTagTextarea(self, name, attributes):
982987

983988
def startTagCdata(self, name, attributes):
984989
"""iframe, noembed noframes, noscript(if scripting enabled)"""
985-
self.parseRCDataCData(name, attributes, "CDATA")
990+
self.parser.parseRCDataCData(name, attributes, "CDATA")
986991

987992
def startTagSelect(self, name, attributes):
988993
self.tree.reconstructActiveFormattingElements()

src/html5lib/liberalxmlparser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,9 @@ def _parse(self, stream, innerHTML=False, container="div", encoding=None,
6363
encoding, lowercaseElementName=False,
6464
lowercaseAttrName=False)
6565

66+
def parseRCDataCData(self, name, attributes, contentType):
67+
self.tree.insertElement(name, attributes)
68+
6669
class XHTMLParser(XMLParser):
6770
""" liberal XHTML parser """
6871

src/html5lib/tokenizer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -315,10 +315,11 @@ def dataState(self):
315315
self.escapeFlag and "".join(self.lastFourChars) == "<!--":
316316
self.escapeFlag = True
317317
self.tokenQueue.append({"type": "Characters", "data":data})
318-
elif data == "<" and (self.contentModelFlag ==\
319-
contentModelFlags["PCDATA"] or (self.contentModelFlag in
320-
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
321-
self.escapeFlag == False)):
318+
elif (data == "<" and (self.contentModelFlag == contentModelFlags["PCDATA"]
319+
or (self.contentModelFlag in
320+
(contentModelFlags["CDATA"],
321+
contentModelFlags["RCDATA"]) and
322+
self.escapeFlag == False))):
322323
self.state = self.states["tagOpen"]
323324
elif data == ">" and self.contentModelFlag in\
324325
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
@@ -343,7 +344,7 @@ def dataState(self):
343344
self.lastFourChars += chars[-4:]
344345
self.lastFourChars = self.lastFourChars[-4:]
345346
return True
346-
347+
347348
def entityDataState(self):
348349
entity = self.consumeEntity()
349350
if entity:

0 commit comments

Comments
 (0)