From ca0f8e9cf5a9de5fddaaa1e6b273b90838f5670c Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 25 Apr 2022 10:23:15 +0300 Subject: [PATCH 1/2] gh-91810: ElementTree: Use text file's encoding by default in XML declaration ElementTree method write() and function tostring() now use the text file's encoding ("UTF-8" if not available) instead of locale encoding in XML declaration when encoding="unicode" is specified. --- Lib/test/test_xml_etree.py | 113 ++++++++++++++---- Lib/xml/etree/ElementTree.py | 23 ++-- ...2-04-25-10-23-01.gh-issue-91810.DOHa6B.rst | 5 + 3 files changed, 101 insertions(+), 40 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 60a41506d8795d..c9900a65591e48 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -10,7 +10,6 @@ import html import io import itertools -import locale import operator import os import pickle @@ -975,15 +974,13 @@ def test_tostring_xml_declaration(self): def test_tostring_xml_declaration_unicode_encoding(self): elem = ET.XML('') - preferredencoding = locale.getpreferredencoding() self.assertEqual( - f"\n", - ET.tostring(elem, encoding='unicode', xml_declaration=True) + ET.tostring(elem, encoding='unicode', xml_declaration=True), + "\n" ) def test_tostring_xml_declaration_cases(self): elem = ET.XML('ø') - preferredencoding = locale.getpreferredencoding() TESTCASES = [ # (expected_retval, encoding, xml_declaration) # ... xml_declaration = None @@ -1010,7 +1007,7 @@ def test_tostring_xml_declaration_cases(self): b"ø", 'US-ASCII', True), (b"\n" b"\xf8", 'ISO-8859-1', True), - (f"\n" + ("\n" "ø", 'unicode', True), ] @@ -1048,11 +1045,10 @@ def test_tostringlist_xml_declaration(self): b"\n" ) - preferredencoding = locale.getpreferredencoding() stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True) self.assertEqual( ''.join(stringlist), - f"\n" + "\n" ) self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>") self.assertEqual(['', '', ''], stringlist[1:]) @@ -3712,33 +3708,98 @@ def test_encoding(self): "" % enc).encode(enc)) def test_write_to_filename(self): - self.addCleanup(os_helper.unlink, TESTFN) - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) tree.write(TESTFN) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''''') + self.assertEqual(f.read(), b'''ø''') + + def test_write_to_filename_with_encoding(self): + self.addCleanup(os_helper.unlink, TESTFN) + tree = ET.ElementTree(ET.XML('''\xf8''')) + tree.write(TESTFN, encoding='utf-8') + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), b'''\xc3\xb8''') + + tree.write(TESTFN, encoding='ISO-8859-1') + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + b'''\n''' + b'''\xf8''') + + def test_write_to_filename_as_unicode(self): + self.addCleanup(os_helper.unlink, TESTFN) + with open(TESTFN, 'w') as f: + encoding = f.encoding + os_helper.unlink(TESTFN) + + tree = ET.ElementTree(ET.XML('''\xf8''')) + tree.write(TESTFN, encoding='unicode') + with open(TESTFN, 'rb') as f: + data = f.read() + expected = "\xf8".encode(encoding, 'xmlcharrefreplace') + self.assertIn( + "\xf8".encode(encoding, 'xmlcharrefreplace'), + data) + if encoding.lower() in ('utf-8', 'ascii'): + self.assertEqual(data, expected) + else: + self.assertIn(b"''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) with open(TESTFN, 'w', encoding='utf-8') as f: tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''''') + self.assertEqual(f.read(), b'''\xc3\xb8''') + + with open(TESTFN, 'w', encoding='ascii', errors='xmlcharrefreplace') as f: + tree.write(f, encoding='unicode') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + b'''\n''' + b'''ø''') + + with open(TESTFN, 'w', encoding='ISO-8859-1') as f: + tree.write(f, encoding='unicode') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + b'''\n''' + b'''\xf8''') def test_write_to_binary_file(self): self.addCleanup(os_helper.unlink, TESTFN) - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) with open(TESTFN, 'wb') as f: tree.write(f) self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), b'''''') + self.assertEqual(f.read(), b'''ø''') + + def test_write_to_binary_file_with_encoding(self): + self.addCleanup(os_helper.unlink, TESTFN) + tree = ET.ElementTree(ET.XML('''\xf8''')) + with open(TESTFN, 'wb') as f: + tree.write(f, encoding='utf-8') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), b'''\xc3\xb8''') + + with open(TESTFN, 'wb') as f: + tree.write(f, encoding='ISO-8859-1') + self.assertFalse(f.closed) + with open(TESTFN, 'rb') as f: + self.assertEqual(f.read(), + b'''\n''' + b'''\xf8''') def test_write_to_binary_file_with_bom(self): self.addCleanup(os_helper.unlink, TESTFN) - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) # test BOM writing to buffered file with open(TESTFN, 'wb') as f: tree.write(f, encoding='utf-16') @@ -3746,7 +3807,7 @@ def test_write_to_binary_file_with_bom(self): with open(TESTFN, 'rb') as f: self.assertEqual(f.read(), '''\n''' - ''''''.encode("utf-16")) + '''\xf8'''.encode("utf-16")) # test BOM writing to non-buffered file with open(TESTFN, 'wb', buffering=0) as f: tree.write(f, encoding='utf-16') @@ -3754,7 +3815,7 @@ def test_write_to_binary_file_with_bom(self): with open(TESTFN, 'rb') as f: self.assertEqual(f.read(), '''\n''' - ''''''.encode("utf-16")) + '''\xf8'''.encode("utf-16")) def test_read_from_stringio(self): tree = ET.ElementTree() @@ -3763,10 +3824,10 @@ def test_read_from_stringio(self): self.assertEqual(tree.getroot().tag, 'site') def test_write_to_stringio(self): - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) stream = io.StringIO() tree.write(stream, encoding='unicode') - self.assertEqual(stream.getvalue(), '''''') + self.assertEqual(stream.getvalue(), '''\xf8''') def test_read_from_bytesio(self): tree = ET.ElementTree() @@ -3775,10 +3836,10 @@ def test_read_from_bytesio(self): self.assertEqual(tree.getroot().tag, 'site') def test_write_to_bytesio(self): - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) raw = io.BytesIO() tree.write(raw) - self.assertEqual(raw.getvalue(), b'''''') + self.assertEqual(raw.getvalue(), b'''ø''') class dummy: pass @@ -3792,12 +3853,12 @@ def test_read_from_user_text_reader(self): self.assertEqual(tree.getroot().tag, 'site') def test_write_to_user_text_writer(self): - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) stream = io.StringIO() writer = self.dummy() writer.write = stream.write tree.write(writer, encoding='unicode') - self.assertEqual(stream.getvalue(), '''''') + self.assertEqual(stream.getvalue(), '''\xf8''') def test_read_from_user_binary_reader(self): raw = io.BytesIO(b'''''') @@ -3809,12 +3870,12 @@ def test_read_from_user_binary_reader(self): tree = ET.ElementTree() def test_write_to_user_binary_writer(self): - tree = ET.ElementTree(ET.XML('''''')) + tree = ET.ElementTree(ET.XML('''\xf8''')) raw = io.BytesIO() writer = self.dummy() writer.write = raw.write tree.write(writer) - self.assertEqual(raw.getvalue(), b'''''') + self.assertEqual(raw.getvalue(), b'''ø''') def test_write_to_user_binary_writer_with_bom(self): tree = ET.ElementTree(ET.XML('''''')) diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 5249c7ab82b84b..a5cc65e789c004 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -728,16 +728,10 @@ def write(self, file_or_filename, encoding = "utf-8" else: encoding = "us-ascii" - enc_lower = encoding.lower() - with _get_writer(file_or_filename, enc_lower) as write: + with _get_writer(file_or_filename, encoding) as (write, declared_encoding): if method == "xml" and (xml_declaration or (xml_declaration is None and - enc_lower not in ("utf-8", "us-ascii", "unicode"))): - declared_encoding = encoding - if enc_lower == "unicode": - # Retrieve the default encoding for the xml declaration - import locale - declared_encoding = locale.getpreferredencoding() + declared_encoding.lower() not in ("utf-8", "us-ascii"))): write("\n" % ( declared_encoding,)) if method == "text": @@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding): write = file_or_filename.write except AttributeError: # file_or_filename is a file name - if encoding == "unicode": - file = open(file_or_filename, "w") + if encoding.lower() == "unicode": + file = open(file_or_filename, "w", + errors="xmlcharrefreplace") else: file = open(file_or_filename, "w", encoding=encoding, errors="xmlcharrefreplace") with file: - yield file.write + yield file.write, file.encoding else: # file_or_filename is a file-like object # encoding determines if it is a text or binary writer - if encoding == "unicode": + if encoding.lower() == "unicode": # use a text writer as is - yield write + yield write, getattr(file_or_filename, "encoding", None) or "utf-8" else: # wrap a binary writer with TextIOWrapper with contextlib.ExitStack() as stack: @@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding): # Keep the original file open when the TextIOWrapper is # destroyed stack.callback(file.detach) - yield file.write + yield file.write, encoding def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree diff --git a/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst new file mode 100644 index 00000000000000..0711f8466b818f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst @@ -0,0 +1,5 @@ +:class:`~xml.etree.ElementTree.ElementTree` method +:meth:`~xml.etree.ElementTree.ElementTree.write` and function +:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding +("UTF-8" if not available) instead of locale encoding in XML declaration +when ``encoding="unicode"`` is specified. From 9d215498c3329315b7ed3dd25cdc36f3917813d4 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 27 Apr 2022 12:22:40 +0300 Subject: [PATCH 2/2] Fix tests on Windows. --- Lib/test/test_xml_etree.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index c9900a65591e48..c5f70cf45b9c3e 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -129,6 +129,9 @@ def newtest(*args, **kwargs): return newtest return decorator +def convlinesep(data): + return data.replace(b'\n', os.linesep.encode()) + class ModuleTest(unittest.TestCase): def test_sanity(self): @@ -3722,9 +3725,9 @@ def test_write_to_filename_with_encoding(self): tree.write(TESTFN, encoding='ISO-8859-1') with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), + self.assertEqual(f.read(), convlinesep( b'''\n''' - b'''\xf8''') + b'''\xf8''')) def test_write_to_filename_as_unicode(self): self.addCleanup(os_helper.unlink, TESTFN) @@ -3759,17 +3762,17 @@ def test_write_to_text_file(self): tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), + self.assertEqual(f.read(), convlinesep( b'''\n''' - b'''ø''') + b'''ø''')) with open(TESTFN, 'w', encoding='ISO-8859-1') as f: tree.write(f, encoding='unicode') self.assertFalse(f.closed) with open(TESTFN, 'rb') as f: - self.assertEqual(f.read(), + self.assertEqual(f.read(), convlinesep( b'''\n''' - b'''\xf8''') + b'''\xf8''')) def test_write_to_binary_file(self): self.addCleanup(os_helper.unlink, TESTFN)