From 7802c1f52c88ff5c8af4abcc077b4d51387562bf Mon Sep 17 00:00:00 2001 From: Nicolas Grekas Date: Mon, 31 Jan 2022 15:01:05 +0100 Subject: [PATCH] [DomCrawler] ignore bad charsets --- src/Symfony/Component/DomCrawler/Crawler.php | 21 +++++++------------ .../DomCrawler/Tests/AbstractCrawlerTest.php | 4 ++++ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/Symfony/Component/DomCrawler/Crawler.php b/src/Symfony/Component/DomCrawler/Crawler.php index 36c7a41e5fbc7..aacb94ad37f2f 100644 --- a/src/Symfony/Component/DomCrawler/Crawler.php +++ b/src/Symfony/Component/DomCrawler/Crawler.php @@ -156,24 +156,17 @@ public function addContent($content, $type = null) return; } - $charset = null; - if (false !== $pos = stripos($type, 'charset=')) { - $charset = substr($type, $pos + 8); - if (false !== $pos = strpos($charset, ';')) { - $charset = substr($charset, 0, $pos); - } - } + $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1'; // http://www.w3.org/TR/encoding/#encodings // http://www.w3.org/TR/REC-xml/#NT-EncName - if (null === $charset && - preg_match('/\]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) { - $charset = $matches[1]; - } + $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) { + if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) { + $charset = $m[2]; + } - if (null === $charset) { - $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1'; - } + return $m[1].$charset; + }, $content, 1); if ('x' === $xmlMatches[1]) { $this->addXmlContent($content, $charset); diff --git a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php index 697306c53ee28..96d9177673c25 100644 --- a/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php +++ b/src/Symfony/Component/DomCrawler/Tests/AbstractCrawlerTest.php @@ -187,6 +187,10 @@ public function testAddContent() $crawler = $this->createCrawler(); $crawler->addContent($this->getDoctype().'中文'); $this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset'); + + $crawler = $this->createCrawler(); + $crawler->addContent($this->getDoctype().'
'); + $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset'); } /**