Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 0c6012f

Browse files
feature #49121 [DomCrawler] Give choice of used parser (victor-prdh)
This PR was squashed before being merged into the 6.3 branch. Discussion ---------- [DomCrawler] Give choice of used parser | Q | A | ------------- | --- | Branch? | 6.3 | Bug fix? | no | New feature? | yes | Deprecations? | no | Tickets | #48950 | License | MIT | Doc PR | symfony/symfony-docs#... <!-- required for new features --> Hi, This first commit is more of a POC and a possible implementation of the feature. I would like to have some feedback before I start adding tests and docs for this new feature. Thanks! Commits ------- 45f03b8 [DomCrawler] Give choice of used parser
2 parents 6858de9 + 45f03b8 commit 0c6012f

File tree

6 files changed

+46
-8
lines changed

6 files changed

+46
-8
lines changed

src/Symfony/Component/BrowserKit/AbstractBrowser.php

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ abstract class AbstractBrowser
3737
protected $internalResponse;
3838
protected $response;
3939
protected $crawler;
40+
protected bool $useHtml5Parser = true;
4041
protected $insulated = false;
4142
protected $redirect;
4243
protected $followRedirects = true;
@@ -207,6 +208,18 @@ public function getCrawler(): Crawler
207208
return $this->crawler;
208209
}
209210

211+
/**
212+
* Sets whether parsing should be done using "masterminds/html5".
213+
*
214+
* @return $this
215+
*/
216+
public function useHtml5Parser(bool $useHtml5Parser): static
217+
{
218+
$this->useHtml5Parser = $useHtml5Parser;
219+
220+
return $this;
221+
}
222+
210223
/**
211224
* Returns the current BrowserKit Response instance.
212225
*/
@@ -497,7 +510,7 @@ protected function createCrawlerFromContent(string $uri, string $content, string
497510
return null;
498511
}
499512

500-
$crawler = new Crawler(null, $uri);
513+
$crawler = new Crawler(null, $uri, null, $this->useHtml5Parser);
501514
$crawler->addContent($content, $type);
502515

503516
return $crawler;

src/Symfony/Component/BrowserKit/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
CHANGELOG
22
=========
33

4+
6.3
5+
---
6+
7+
* Add `AbstractBrowser::useHtml5Parser()`
8+
49
6.1
510
---
611

src/Symfony/Component/DomCrawler/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ CHANGELOG
44
6.3
55
---
66

7+
* Add `$useHtml5Parser` argument to `Crawler`
78
* Add `CrawlerSelectorCount` test constraint
89
* Add argument `$normalizeWhitespace` to `Crawler::innerText()`
910
* Make `Crawler::innerText()` return the first non-empty text

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,16 +58,17 @@ class Crawler implements \Countable, \IteratorAggregate
5858
*/
5959
private bool $isHtml = true;
6060

61-
private HTML5 $html5Parser;
61+
62+
private ?HTML5 $html5Parser = null;
6263

6364
/**
6465
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
6566
*/
66-
public function __construct(\DOMNodeList|\DOMNode|array|string $node = null, string $uri = null, string $baseHref = null)
67+
public function __construct(\DOMNodeList|\DOMNode|array|string $node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = true)
6768
{
6869
$this->uri = $uri;
6970
$this->baseHref = $baseHref ?: $uri;
70-
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
71+
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
7172
$this->cachedNamespaces = new \ArrayObject();
7273

7374
$this->add($node);
@@ -621,7 +622,7 @@ public function html(string $default = null): string
621622
$node = $this->getNode(0);
622623
$owner = $node->ownerDocument;
623624

624-
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
625+
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
625626
$owner = $this->html5Parser;
626627
}
627628

@@ -642,7 +643,7 @@ public function outerHtml(): string
642643
$node = $this->getNode(0);
643644
$owner = $node->ownerDocument;
644645

645-
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
646+
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
646647
$owner = $this->html5Parser;
647648
}
648649

@@ -1215,6 +1216,10 @@ private function parseHtmlString(string $content, string $charset): \DOMDocument
12151216

12161217
private function canParseHtml5String(string $content): bool
12171218
{
1219+
if (!$this->html5Parser) {
1220+
return false;
1221+
}
1222+
12181223
if (false === ($pos = stripos($content, '<!doctype html>'))) {
12191224
return false;
12201225
}

src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTestCase.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ abstract class AbstractCrawlerTestCase extends TestCase
2121
{
2222
abstract public static function getDoctype(): string;
2323

24-
protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
24+
protected function createCrawler($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = true)
2525
{
26-
return new Crawler($node, $uri, $baseHref);
26+
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
2727
}
2828

2929
public function testConstructor()

src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,20 @@ public function testHtml5ParserWithInvalidHeadedContent(string $content)
4646
self::assertEmpty($crawler->filterXPath('//h1')->text(), '->addHtmlContent failed as expected');
4747
}
4848

49+
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
50+
{
51+
// HTML that triggers a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
52+
$html = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
53+
54+
$html5Crawler = $this->createCrawler(null, null, null, true);
55+
$html5Crawler->add($html);
56+
57+
$nativeCrawler = $this->createCrawler(null, null, null, false);
58+
$nativeCrawler->add($html);
59+
60+
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
61+
}
62+
4963
public static function validHtml5Provider(): iterable
5064
{
5165
$html = self::getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

0 commit comments

Comments
 (0)