Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 7e85a6a

Browse files
committed
bug #37803 Fix for issue #37681 (Rav)
This PR was squashed before being merged into the 4.4 branch.

Discussion
----------

Fix for issue #37681

| Q             | A
| ------------- | ---
| Branch?       | 4.4
| Bug fix?      | yes
| New feature?  | no
| Deprecations? | no
| Tickets       | Fix #37681
| License       | MIT
| Doc PR        |

Allow a BOM character and comments before the `<!DOCTYPE html>` declaration in DomCrawler while choosing a parser implementation.

Commits
-------

9bc249e Fix for issue #37681
2 parents 8761f80 + 9bc249e commit 7e85a6a

File tree

2 files changed

+84
-2
lines changed

2 files changed

+84
-2
lines changed

src/Symfony/Component/DomCrawler/Crawler.php

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,7 @@ public function addContent($content, $type = null)
188188
*/
189189
public function addHtmlContent($content, $charset = 'UTF-8')
190190
{
191-
// Use HTML5 parser if the content is HTML5 and the library is available
192-
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
191+
$dom = $this->parseHtmlString($content, $charset);
193192
$this->addDocument($dom);
194193

195194
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -1295,4 +1294,35 @@ private function createCssSelectorConverter(): CssSelectorConverter
12951294

12961295
return new CssSelectorConverter($this->isHtml);
12971296
}
1297+
1298+
/**
 * Builds a DOMDocument from an HTML string.
 *
 * Delegates to parseHtml5() when canParseHtml5String() accepts the content,
 * and to parseXhtml() otherwise.
 */
private function parseHtmlString(string $content, string $charset): \DOMDocument
{
    return $this->canParseHtml5String($content)
        ? $this->parseHtml5($content, $charset)
        : $this->parseXhtml($content, $charset);
}
1310+
1311+
/**
 * Tells whether the given content should be handed to the HTML5 parser.
 *
 * Returns false when no HTML5 parser is set or when the content holds no
 * "<!doctype html>" marker (matched case-insensitively). Otherwise the text
 * preceding the marker must be empty or accepted by isValidHtml5Heading().
 */
private function canParseHtml5String(string $content): bool
{
    if (null === $this->html5Parser) {
        return false;
    }

    $doctypeOffset = stripos($content, '<!doctype html>');
    if (false === $doctypeOffset) {
        return false;
    }

    // A doctype at offset 0 means there is no leading text to validate.
    if (0 === $doctypeOffset) {
        return true;
    }

    return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
}
1323+
1324+
/**
 * Checks that the text preceding the doctype consists only of an optional
 * BOM, whitespace and HTML comments.
 *
 * A comment body runs up to the first "-->" and may contain ">" characters
 * (e.g. "<!-- a > b -->" or conditional comments such as
 * "<!--[if lt IE 9]><![endif]-->"). Any text outside of comments makes the
 * heading invalid.
 *
 * @param string $heading the content found before the doctype declaration
 *
 * @return bool true when the heading matches; false otherwise, including
 *              when preg_match() fails (e.g. on malformed UTF-8 input)
 */
private function isValidHtml5Heading(string $heading): bool
{
    // The comment body is matched possessively as "anything that is not the
    // start of '-->'", so the engine cannot backtrack into a closed comment
    // and swallow stray text located between two comments.
    return 1 === preg_match('/^\x{FEFF}?\s*(?:<!--(?:[^-]++|-(?!->))*+-->\s*)*$/u', $heading);
}
12981328
}

src/Symfony/Component/DomCrawler/Tests/Html5ParserCrawlerTest.php

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,56 @@ public function testAddHtml5()
2525
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
2626
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
2727
}
28+
29+
/**
 * Adds each dataset from validHtml5Provider() through addHtmlContent() and
 * expects the text of the //h1 node to be "Foo".
 *
 * @dataProvider validHtml5Provider
 */
public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEquals('Foo', $headingText, '->addHtmlContent() parses valid HTML with comment before doctype');
}
42+
43+
/**
 * Adds each dataset from invalidHtml5Provider() through addHtmlContent() and
 * expects the text of the //h1 node to be empty.
 *
 * @dataProvider invalidHtml5Provider
 */
public function testHtml5ParserWithInvalidHeadedContent(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEmpty($headingText, '->addHtmlContent failed as expected');
}
52+
53+
/**
 * Yields documents whose doctype is preceded by a UTF-8 BOM, comments
 * and/or whitespace.
 */
public function validHtml5Provider(): iterable
{
    // The three bytes of the UTF-8 encoded byte order mark.
    $bom = "\xEF\xBB\xBF";
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    yield 'BOM first' => [$bom.$document];
    yield 'Single comment' => ['<!-- comment -->'.$document];
    yield 'Multiline comment' => ["<!-- \n multiline comment \n -->".$document];
    yield 'Several comments' => ['<!--c--> <!--cc-->'.$document];
    yield 'Whitespaces' => [' '.$document];
    yield 'All together' => [$bom.' <!--c-->'.$document];
}
65+
66+
/**
 * Yields documents whose doctype is preceded by plain text.
 */
public function invalidHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    $prefixes = [
        'Text' => 'hello world',
        'Text between comments' => '<!--c--> test <!--cc-->',
    ];

    foreach ($prefixes as $label => $prefix) {
        yield $label => [$prefix.$document];
    }
}
73+
74+
/**
 * Marks the running test as skipped when the Masterminds\HTML5 class
 * cannot be found.
 */
private function skipTestIfHTML5LibraryNotAvailable(): void
{
    if (class_exists(\Masterminds\HTML5::class)) {
        return;
    }

    self::markTestSkipped('HTML5 library is not available');
}
2880
}

0 commit comments

Comments
 (0)