Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 0c239c7

Browse files
committed
Playing with XHTML, experiments with libXML/HTMLParser.h *Work in progress*
Differs from initWithXMLString by using libXML's HTML parser, which automatically decodes XHTML/HTML entities found within the document; this eliminates the need to resanitize strings extracted from the document. libXML treats an htmlDocPtr the same as an xmlDocPtr. HTMLParser.h promises that it handles 'real world' HTML as well. Is it a fair assumption that if the input is indeed XHTML, the output is the same (with the addition of decoding XHTML character entities)? Signed-off-by: Todd Brannam <[email protected]>
1 parent 32eff29 commit 0c239c7

6 files changed

Lines changed: 387 additions & 22 deletions

File tree

Source/CXHTMLDocument.h

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//
2+
// CXHTMLDocument.h
3+
// TouchCode
4+
//
5+
// Created by Jonathan Wight on 03/07/08.
6+
// Copyright 2008 toxicsoftware.com. All rights reserved.
7+
//
8+
// Permission is hereby granted, free of charge, to any person
9+
// obtaining a copy of this software and associated documentation
10+
// files (the "Software"), to deal in the Software without
11+
// restriction, including without limitation the rights to use,
12+
// copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
// copies of the Software, and to permit persons to whom the
14+
// Software is furnished to do so, subject to the following
15+
// conditions:
16+
//
17+
// The above copyright notice and this permission notice shall be
18+
// included in all copies or substantial portions of the Software.
19+
//
20+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21+
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22+
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23+
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24+
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25+
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26+
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27+
// OTHER DEALINGS IN THE SOFTWARE.
28+
//
29+
30+
#import "CXMLDocument.h"
31+
32+
33+
// A CXMLDocument whose contents are produced by libXML's HTML parser
// rather than its XML parser.
@interface CXHTMLDocument : CXMLDocument

// Parses the bytes in inData with libXML's HTML parser.
//   inData     The raw document bytes; nil or empty data is reported as an error.
//   encoding   The character encoding of inData.
//   inOptions  Currently unused by the implementation.
//   outError   If non-NULL, receives an NSError when parsing fails.
// Returns the initialized document, or nil when the data could not be parsed.
- (id)initWithXHTMLData:(NSData *)inData encoding:(NSStringEncoding)encoding options:(NSUInteger)inOptions error:(NSError **)outError;

// Parses inString (handed to the parser as UTF-8) with libXML's HTML parser.
//   inString   The (X)HTML source text.
//   inOptions  Currently unused by the implementation.
//   outError   If non-NULL, receives an NSError when parsing fails.
// Returns the initialized document, or nil when the string could not be parsed.
- (id)initWithXHTMLString:(NSString *)inString options:(NSUInteger)inOptions error:(NSError **)outError;

@end

Source/CXHTMLDocument.m

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
//
2+
// CXHTMLDocument.m
3+
// TouchCode
4+
//
5+
// Created by Jonathan Wight on 03/07/08.
6+
// Copyright 2008 toxicsoftware.com. All rights reserved.
7+
//
8+
// Permission is hereby granted, free of charge, to any person
9+
// obtaining a copy of this software and associated documentation
10+
// files (the "Software"), to deal in the Software without
11+
// restriction, including without limitation the rights to use,
12+
// copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
// copies of the Software, and to permit persons to whom the
14+
// Software is furnished to do so, subject to the following
15+
// conditions:
16+
//
17+
// The above copyright notice and this permission notice shall be
18+
// included in all copies or substantial portions of the Software.
19+
//
20+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21+
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22+
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23+
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24+
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25+
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26+
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27+
// OTHER DEALINGS IN THE SOFTWARE.
28+
//
29+
30+
// This is an experiment to see if we can utilize the HTMLParser functionality
31+
// of libXML to serve as an XHTML parser. I question whether this is a good idea or not;
32+
// need to test some of the following
33+
// [-] How are xml namespaces handled
34+
// [-] Can we support DTD
35+
// [-]
36+
37+
#import "CXHTMLDocument.h"
38+
39+
#include <libxml/parser.h>
40+
#include <libxml/htmlparser.h>
41+
#include <libxml/HTMLtree.h>
42+
#include <libxml/xpath.h>
43+
44+
#import "CXMLNode_PrivateExtensions.h"
45+
#import "CXMLElement.h"
46+
47+
#if TOUCHXMLUSETIDY
48+
#import "CTidy.h"
49+
#endif /* TOUCHXMLUSETIDY */
50+
51+
@implementation CXHTMLDocument
52+
53+
54+
// Locates the <body> element of a parsed HTML document.
//
// NOTE: XPath is heavyweight for the somewhat trivial task of finding the body
// element; this should eventually be replaced by a direct walk of the tree.
//
//   doc  The parsed document to search.
// Returns the first node matching /html/body, or NULL when the XPath context
// cannot be created, the expression fails to evaluate, or nothing matches.
// The returned node still belongs to doc; the caller must not free it.
static xmlNodePtr xhtml_find_body (xmlDocPtr doc)
{
    xmlNodePtr node = NULL;
    xmlXPathObjectPtr xpathObj = NULL;
    xmlXPathContextPtr xpathCtxt = xmlXPathNewContext (doc);

    if (xpathCtxt != NULL)
        xpathObj = xmlXPathEvalExpression (BAD_CAST("/html/body"), xpathCtxt);

    // xmlXPathNodeSetIsEmpty() guards against a NULL nodesetval and tests the
    // number of matches (nodeNr). nodeMax is only the allocated capacity and
    // may be non-zero for a set that contains no nodes.
    if (xpathObj != NULL
        && xpathObj->type == XPATH_NODESET
        && !xmlXPathNodeSetIsEmpty (xpathObj->nodesetval))
    {
        node = xpathObj->nodesetval->nodeTab[0];
    }

    if (xpathObj != NULL)
        xmlXPathFreeObject (xpathObj);
    if (xpathCtxt != NULL)
        xmlXPathFreeContext (xpathCtxt);

    return node;
}
79+
80+
// Differs from initWithXMLString by using libXML's HTML parser, which automatically
// decodes XHTML/HTML entities found within the document. This eliminates the need
// to resanitize strings extracted from the document. libXML treats an htmlDocPtr
// the same as an xmlDocPtr, so the result is stored in _node like any other document.
//
// When the parsed tree contains /html/body with at least one child, that first
// child is promoted to document root and the old <html> tree is freed.
//
//   inString   The (X)HTML source; it is passed to the parser as UTF-8.
//   inOptions  Currently unused.
//   outError   If non-NULL, receives the parse error on failure and NULL on success.
// Returns the initialized document, or NULL (after releasing self) when parsing fails.
- (id)initWithXHTMLString:(NSString *)inString options:(NSUInteger)inOptions error:(NSError **)outError
{
#pragma unused (inOptions)
    if ((self = [super init]) != NULL)
    {
        NSError *theError = NULL;

        htmlDocPtr theDoc = htmlParseDoc(BAD_CAST[inString UTF8String], xmlGetCharEncodingName(XML_CHAR_ENCODING_UTF8));

        if (theDoc != NULL)
        {
            // TODO: change code to not depend on XPATH, should be a task simple enough to do
            // alternatively see if we can prevent the HTML parser from adding implied tags
            xmlXPathContextPtr xpathContext = xmlXPathNewContext(theDoc);

            xmlXPathObjectPtr xpathObject = NULL;
            if (xpathContext != NULL)
                xpathObject = xmlXPathEvalExpression(BAD_CAST("/html/body"), xpathContext);

            // xmlXPathNodeSetIsEmpty() copes with a NULL nodesetval and checks the
            // match count (nodeNr); nodeMax is merely the allocated capacity.
            xmlNodePtr bodyNode = NULL;
            if (xpathObject != NULL
                && xpathObject->type == XPATH_NODESET
                && !xmlXPathNodeSetIsEmpty(xpathObject->nodesetval))
            {
                bodyNode = xpathObject->nodesetval->nodeTab[0];
            }

            // xmlDocSetRootElement() unlinks the previous root and hands it back;
            // ownership of that detached tree passes to us. An empty <body> has no
            // child to promote, so the document is left untouched in that case.
            xmlNodePtr oldRoot = NULL;
            if (bodyNode != NULL && bodyNode->children != NULL)
                oldRoot = xmlDocSetRootElement(theDoc, bodyNode->children);

            // Release the XPath result before the old tree: freeing a node set
            // inspects the nodes it references, and bodyNode lives in the old tree.
            if (xpathObject != NULL)
                xmlXPathFreeObject(xpathObject);

            if (xpathContext != NULL)
                xmlXPathFreeContext(xpathContext);

            if (oldRoot != NULL)
                xmlFreeNode(oldRoot);

            _node = (xmlNodePtr)theDoc;
            NSAssert(_node->_private == NULL, @"TODO");
            _node->_private = self; // Note. NOT retained (TODO think more about _private usage)
        }
        else
        {
            xmlErrorPtr theLastErrorPtr = xmlGetLastError();

            // The error record or its message may be absent, and the message may not
            // be valid UTF-8; fall back to a generic description in all those cases.
            NSString *theDescription = NULL;
            if (theLastErrorPtr != NULL && theLastErrorPtr->message != NULL)
                theDescription = [NSString stringWithUTF8String:theLastErrorPtr->message];
            if (theDescription == NULL)
                theDescription = @"unknown";

            NSDictionary *theUserInfo = [NSDictionary dictionaryWithObjectsAndKeys:
                theDescription, NSLocalizedDescriptionKey,
                NULL];

            theError = [NSError errorWithDomain:@"CXMLErrorDomain" code:1 userInfo:theUserInfo];

            xmlResetLastError();
        }

        if (outError)
            *outError = theError;

        if (theError != NULL)
        {
            [self release];
            self = NULL;
        }
    }
    return(self);
}
146+
147+
// Parses raw (X)HTML bytes with libXML's HTML parser and stores the resulting
// document in _node.
//
//   inData     The document bytes. nil, empty, or larger-than-INT_MAX data is
//              rejected, because htmlReadMemory() takes its size as an int.
//   encoding   The character encoding of inData. It is translated to an IANA
//              charset name for libXML; when no name is available NULL is passed
//              to the parser instead.
//   inOptions  Currently unused.
//   outError   If non-NULL, receives the error on failure and NULL on success.
// Returns the initialized document, or NULL (after releasing self) on failure.
- (id)initWithXHTMLData:(NSData *)inData encoding:(NSStringEncoding)encoding options:(NSUInteger)inOptions error:(NSError **)outError
{
#pragma unused (inOptions)
    if ((self = [super init]) != NULL)
    {
        NSError *theError = NULL;
        xmlDocPtr theDoc = NULL;

        NSUInteger theLength = [inData length];
        if (theLength > 0 && theLength <= (NSUInteger)INT_MAX)
        {
            CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
            CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);

            // CFStringGetCStringPtr() is allowed to return NULL for any string, and
            // the charset name itself is NULL for encodings without an IANA name.
            // Copy the name into a local buffer instead of relying on either.
            char theCharsetBuffer[64];
            const char *enc = NULL;
            if (cfencstr != NULL
                && CFStringGetCString(cfencstr, theCharsetBuffer, sizeof(theCharsetBuffer), kCFStringEncodingASCII))
            {
                enc = theCharsetBuffer;
            }

            theDoc = htmlReadMemory([inData bytes], (int)theLength, NULL, enc, HTML_PARSE_NONET | HTML_PARSE_NOBLANKS | HTML_PARSE_NOWARNING);
        }

        if (theDoc != NULL)
        {
            _node = (xmlNodePtr)theDoc;
            _node->_private = self; // Note. NOT retained (TODO think more about _private usage)
        }
        else
        {
            theError = [NSError errorWithDomain:@"CXMLErrorDomain" code:-1 userInfo:NULL];
        }

        if (outError)
            *outError = theError;

        if (theError != NULL)
        {
            [self release];
            self = NULL;
        }
    }
    return(self);
}
187+
188+
189+
190+
@end

Support/Validator/CMainController.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@
3535
NSString *XMLString;
3636
NSString *XPath;
3737
NSString *status;
38+
NSUInteger documentType;
3839
}
3940

4041
@property (readwrite, nonatomic, assign) IBOutlet NSWindow *window;
4142
@property (readwrite, nonatomic, copy) NSString *XMLString;
4243
@property (readwrite, nonatomic, copy) NSString *XPath;
4344
@property (readwrite, nonatomic, copy) NSString *status;
45+
@property (readwrite, nonatomic, assign) NSUInteger documentType;
4446

4547
@end

Support/Validator/CMainController.m

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,20 @@
3030
#import "CMainController.h"
3131

3232
#import "CXMLDocument.h"
33+
#import "CXHTMLDocument.h"
34+
35+
// Private interface: declares -updateStatus ahead of the implementation so the
// property setters defined before it can call it.
@interface CMainController ()

- (void)updateStatus;

@end
38+
3339

3440
@implementation CMainController
3541

3642
@synthesize window;
3743
@synthesize XMLString;
3844
@synthesize XPath;
3945
@synthesize status;
46+
@synthesize documentType;
4047

4148
- (void)applicationDidFinishLaunching:(NSNotification *)inNotification
4249
{
@@ -79,10 +86,24 @@ - (void)setXPath:(NSString *)inXPath
7986
}
8087
}
8188

89+
// Custom setter for documentType: stores the new value and refreshes the status,
// but only when the value actually changes.
- (void)setDocumentType:(NSUInteger)aDocumentType
{
    // Unchanged value: skip the assignment and the status refresh.
    if (documentType == aDocumentType)
        return;

    documentType = aDocumentType;
    [self updateStatus];
}
97+
8298
- (void)updateStatus
8399
{
84100
NSError *theError = NULL;
85-
CXMLDocument *theXMLDocument = [[[CXMLDocument alloc] initWithXMLString:self.XMLString options:0 error:&theError] autorelease];
101+
CXMLDocument *theXMLDocument = nil;
102+
if (self.documentType == 0)
103+
theXMLDocument = [[[CXMLDocument alloc] initWithXMLString:self.XMLString options:0 error:&theError] autorelease];
104+
else
105+
theXMLDocument = [[[CXHTMLDocument alloc] initWithXHTMLString:self.XMLString options:0 error:&theError] autorelease];
106+
86107
if (theXMLDocument)
87108
{
88109
if (self.XPath.length > 0)

0 commit comments

Comments
 (0)