-
-
Notifications
You must be signed in to change notification settings - Fork 9.6k
[DomCrawler] Optionally use html5-php to parse HTML #29306
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
|
||
namespace Symfony\Component\DomCrawler; | ||
|
||
use Masterminds\HTML5; | ||
use Symfony\Component\CssSelector\CssSelectorConverter; | ||
|
||
/** | ||
|
@@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate | |
private $isHtml = true; | ||
|
||
/** | ||
* @param mixed $node A Node to use as the base for the crawling | ||
* @param string $uri The current URI | ||
* @param string $baseHref The base href value | ||
* @var HTML5|null | ||
*/ | ||
public function __construct($node = null, string $uri = null, string $baseHref = null) | ||
private $html5Parser; | ||
|
||
/** | ||
* @param mixed $node A Node to use as the base for the crawling | ||
* @param string $uri The current URI | ||
* @param string $baseHref The base href value | ||
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser | ||
*/ | ||
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null) | ||
{ | ||
$this->uri = $uri; | ||
$this->baseHref = $baseHref ?: $uri; | ||
|
||
if ($useHtml5Parser && !class_exists(HTML5::class)) { | ||
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".'); | ||
} | ||
|
||
if ($useHtml5Parser ?? class_exists(HTML5::class)) { | ||
$this->html5Parser = new HTML5(['disable_html_ns' => true]); | ||
} | ||
|
||
$this->add($node); | ||
} | ||
|
||
|
@@ -183,29 +198,7 @@ public function addContent($content, $type = null) | |
*/ | ||
public function addHtmlContent($content, $charset = 'UTF-8') | ||
{ | ||
$internalErrors = libxml_use_internal_errors(true); | ||
$disableEntities = libxml_disable_entity_loader(true); | ||
|
||
$dom = new \DOMDocument('1.0', $charset); | ||
$dom->validateOnParse = true; | ||
|
||
set_error_handler(function () { throw new \Exception(); }); | ||
|
||
try { | ||
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() | ||
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); | ||
} catch (\Exception $e) { | ||
} | ||
|
||
restore_error_handler(); | ||
|
||
if ('' !== trim($content)) { | ||
@$dom->loadHTML($content); | ||
} | ||
|
||
libxml_use_internal_errors($internalErrors); | ||
libxml_disable_entity_loader($disableEntities); | ||
|
||
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset); | ||
$this->addDocument($dom); | ||
|
||
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']); | ||
|
@@ -608,6 +601,15 @@ public function html(/* $default = null */) | |
throw new \InvalidArgumentException('The current node list is empty.'); | ||
} | ||
|
||
if (null !== $this->html5Parser) { | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is an issue here. You instantiate the HTML5 parser in the constructor even when the content added is not HTML5 but XML or existing DOM elements (coming from somewhere other than a parent crawler using HTML5). This means you might be saving with the HTML5 parser when it was not used for parsing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How do you propose to improve this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, we need to distinguish 3 cases:
The boolean argument in the constructor allows us to decide between the first 2 cases at the time we instantiate. But whether the content is HTML or not is not something the constructor knows (as content can be added later). The solution might be to store the boolean as a property. Then, based on that, we would decide which parsing strategy to use when we load HTML, and instantiate the HTML5 parser only if needed. And for sub-crawlers, we copy the value of the private property. |
||
$html = ''; | ||
foreach ($this->getNode(0)->childNodes as $child) { | ||
$html .= $this->html5Parser->saveHTML($child); | ||
} | ||
|
||
return $html; | ||
} | ||
|
||
$html = ''; | ||
foreach ($this->getNode(0)->childNodes as $child) { | ||
$html .= $child->ownerDocument->saveHTML($child); | ||
|
@@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling') | |
return $nodes; | ||
} | ||
|
||
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument | ||
{ | ||
return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset), [], $charset); | ||
} | ||
|
||
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument | ||
{ | ||
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset); | ||
|
||
$internalErrors = libxml_use_internal_errors(true); | ||
$disableEntities = libxml_disable_entity_loader(true); | ||
|
||
$dom = new \DOMDocument('1.0', $charset); | ||
$dom->validateOnParse = true; | ||
|
||
if ('' !== trim($htmlContent)) { | ||
@$dom->loadHTML($htmlContent); | ||
} | ||
|
||
libxml_use_internal_errors($internalErrors); | ||
libxml_disable_entity_loader($disableEntities); | ||
|
||
return $dom; | ||
} | ||
|
||
/** | ||
* Convert charset to HTML-entities to ensure valid parsing. | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Converts |
||
*/ | ||
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string | ||
{ | ||
set_error_handler(function () { throw new \Exception(); }); | ||
|
||
try { | ||
return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset); | ||
} catch (\Exception $e) { | ||
try { | ||
$htmlContent = iconv($charset, 'UTF-8', $htmlContent); | ||
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8'); | ||
} catch (\Exception $e) { | ||
} | ||
|
||
return $htmlContent; | ||
} finally { | ||
restore_error_handler(); | ||
} | ||
} | ||
|
||
/** | ||
* @throws \InvalidArgumentException | ||
*/ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When creating a child crawler, you should not rely on guessing but pass the existing value used for the parsing (or even better, assign the actual parser instead of instantiating a new one).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You mean in the `createSubCrawler` method?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes