Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[DomCrawler] Optionally use html5-php to parse HTML #29306

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
"doctrine/orm": "~2.4,>=2.4.5",
"doctrine/reflection": "~1.0",
"doctrine/doctrine-bundle": "~1.4",
"masterminds/html5": "^2.6",
"monolog/monolog": "~1.11",
"nyholm/psr7": "^1.0",
"ocramius/proxy-manager": "~0.4|~1.0|~2.0",
Expand All @@ -112,6 +113,7 @@
"phpdocumentor/reflection-docblock": "^3.0|^4.0"
},
"conflict": {
"masterminds/html5": "<2.6",
"phpdocumentor/reflection-docblock": "<3.0||>=3.2.0,<3.2.2",
"phpdocumentor/type-resolver": "<0.3.0",
"phpunit/phpunit": "<5.4.3"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,19 @@ public function welcomeAction(Request $request, $name = null)
// new session case
if (!$session->has('name')) {
if (!$name) {
return new Response('You are new here and gave no name.');
return new Response('<html><body>You are new here and gave no name.</body></html>');
}

// remember name
$session->set('name', $name);

return new Response(sprintf('Hello %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Hello %s, nice to meet you.</body></html>', $name));
}

// existing session
$name = $session->get('name');

return new Response(sprintf('Welcome back %s, nice to meet you.', $name));
return new Response(sprintf('<html><body>Welcome back %s, nice to meet you.</body></html>', $name));
}

public function cacheableAction()
Expand All @@ -55,7 +55,7 @@ public function logoutAction(Request $request)
{
$request->getSession()->invalidate();

return new Response('Session cleared.');
return new Response('<html><body>Session cleared.</body></html>');
}

public function setFlashAction(Request $request, $message)
Expand All @@ -76,6 +76,6 @@ public function showFlashAction(Request $request)
$output = 'No flash was set.';
}

return new Response($output);
return new Response('<html><body>'.$output.'</body></html>');
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ public function secureAction()

public function profileAction()
{
return new Response('Profile');
return new Response('<html><body>Profile</body></html>');
}

public function homepageAction()
{
return new Response('Homepage');
return new Response('<html><body>Homepage</body></html>');
}
}
2 changes: 2 additions & 0 deletions src/Symfony/Component/DomCrawler/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ CHANGELOG

* Added return of element name (`_name`) in `extract()` method.
* Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty.
* When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to
parse HTML added to a Crawler for better support of HTML5 tags.

4.2.0
-----
Expand Down
103 changes: 76 additions & 27 deletions src/Symfony/Component/DomCrawler/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

namespace Symfony\Component\DomCrawler;

use Masterminds\HTML5;
use Symfony\Component\CssSelector\CssSelectorConverter;

/**
Expand Down Expand Up @@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
private $isHtml = true;

/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @var HTML5|null
*/
public function __construct($node = null, string $uri = null, string $baseHref = null)
private $html5Parser;

/**
* @param mixed $node A Node to use as the base for the crawling
* @param string $uri The current URI
* @param string $baseHref The base href value
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser
*/
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
{
$this->uri = $uri;
$this->baseHref = $baseHref ?: $uri;

if ($useHtml5Parser && !class_exists(HTML5::class)) {
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
}

if ($useHtml5Parser ?? class_exists(HTML5::class)) {
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When creating a child crawler, you should not rely on guessing but pass the existing value used for the parsing (or even better, assign the actual parser instead of instantiating a new one).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You mean in the createSubCrawler method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

}

$this->add($node);
}

Expand Down Expand Up @@ -183,29 +198,7 @@ public function addContent($content, $type = null)
*/
public function addHtmlContent($content, $charset = 'UTF-8')
{
$internalErrors = libxml_use_internal_errors(true);
$disableEntities = libxml_disable_entity_loader(true);

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;

set_error_handler(function () { throw new \Exception(); });

try {
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
} catch (\Exception $e) {
}

restore_error_handler();

if ('' !== trim($content)) {
@$dom->loadHTML($content);
}

libxml_use_internal_errors($internalErrors);
libxml_disable_entity_loader($disableEntities);

$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
$this->addDocument($dom);

$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
Expand Down Expand Up @@ -608,6 +601,15 @@ public function html(/* $default = null */)
throw new \InvalidArgumentException('The current node list is empty.');
}

if (null !== $this->html5Parser) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is an issue here. You instantiate the HTML5 parser in the constructor even when the content added is not HTML5 but XML or existing DOM elements (coming from elsewhere than a parent crawler using HTML5). This means you might be saving with the HTML5 parser when it was not used for parsing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you propose to improve this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well, we need to distinguish 3 cases:

  • we are parsing some HTML5
  • we are parsing some older HTML
  • we are not parsing HTML at all

The boolean argument in the constructor allows us to decide between the first 2 cases at the time we instantiate. But knowing whether this is HTML or not is not something the constructor knows (as content can be added later).

The solution might be to store the boolean property. Then, based on that, we would decide which parsing strategy to use if we load HTML and instantiate the HTML5 parser if needed.
Then, here, we can keep saying "if I used an HTML5 parser, I also use it for saving".

And for subcrawlers, we copy the content of the private property.

$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $this->html5Parser->saveHTML($child);
}

return $html;
}

$html = '';
foreach ($this->getNode(0)->childNodes as $child) {
$html .= $child->ownerDocument->saveHTML($child);
Expand Down Expand Up @@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling')
return $nodes;
}

/**
 * Builds a DOM document from HTML using the parser stored in $this->html5Parser.
 *
 * The markup is first run through convertToHtmlEntities() and the result is
 * handed to the parser together with an empty options array and the charset.
 *
 * @param string $htmlContent The HTML markup to parse
 * @param string $charset     The charset the markup is encoded in
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $entityEncoded = $this->convertToHtmlEntities($htmlContent, $charset);

    return $this->html5Parser->parse($entityEncoded, [], $charset);
}

/**
 * Builds a DOM document from HTML using the native \DOMDocument::loadHTML() parser.
 *
 * The markup is first run through convertToHtmlEntities(). libxml's internal
 * error handling is switched on while parsing and the previous libxml state is
 * always restored afterwards. Blank content yields an empty document.
 *
 * @param string $htmlContent The HTML markup to parse
 * @param string $charset     The charset the markup is encoded in
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

    $internalErrors = libxml_use_internal_errors(true);

    // libxml >= 2.9.0 does not load external entities by default, and
    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so the
    // entity loader is only toggled on older libxml builds.
    $toggleEntityLoader = \LIBXML_VERSION < 20900;
    $disableEntities = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        // loadHTML() must not be called with empty markup.
        if ('' !== trim($htmlContent)) {
            @$dom->loadHTML($htmlContent);
        }

        return $dom;
    } finally {
        // Restore the global libxml state even if parsing throws.
        libxml_use_internal_errors($internalErrors);
        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($disableEntities);
        }
    }
}

/**
* Converts charset to HTML-entities to ensure valid parsing.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Converts

*/
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Replaces every non-ASCII character of $htmlContent (encoded in $charset)
    // with a numeric character reference so the parsers receive ASCII-safe
    // input. ASCII characters, including markup, are left untouched.
    //
    // mb_encode_numericentity() is used instead of
    // mb_convert_encoding(..., 'HTML-ENTITIES', ...): the latter is deprecated
    // since PHP 8.2, and the throwing error handler installed below would turn
    // that deprecation notice into an exception, silently skipping the
    // conversion altogether.
    $nonAsciiMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Promote warnings/notices raised by mbstring or iconv to exceptions so
    // that a failed conversion can be detected and the fallback tried.
    set_error_handler(function () { throw new \Exception(); });

    try {
        return mb_encode_numericentity($htmlContent, $nonAsciiMap, $charset);
    } catch (\Exception | \ValueError $e) {
        // mbstring does not know the charset (a warning on PHP 7, a
        // \ValueError on PHP 8): go through iconv to UTF-8 first.
        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $nonAsciiMap, 'UTF-8');
        } catch (\Exception | \ValueError $e) {
            // Best effort: keep whatever was converted so far.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}

/**
* @throws \InvalidArgumentException
*/
Expand Down
Loading