Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit c17f172

Browse files
committed
Issue #8047: Fix the xml.etree serializer to return bytes by default.
Use ``encoding="unicode"`` to generate a Unicode string.
1 parent 1a0a737 commit c17f172

4 files changed

Lines changed: 73 additions & 57 deletions

File tree

Doc/library/xml.etree.elementtree.rst

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -148,20 +148,22 @@ Functions
148148
arguments. Returns an element instance.
149149

150150

151-
.. function:: tostring(element, encoding=None, method="xml")
151+
.. function:: tostring(element, encoding="us-ascii", method="xml")
152152

153153
Generates a string representation of an XML element, including all
154154
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
155-
the output encoding (default is None). *method* is either ``"xml"``,
155+
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
156+
generate a Unicode string. *method* is either ``"xml"``,
156157
``"html"`` or ``"text"`` (default is ``"xml"``). Returns an (optionally)
157158
encoded string containing the XML data.
158159

159160

160-
.. function:: tostringlist(element, encoding=None, method="xml")
161+
.. function:: tostringlist(element, encoding="us-ascii", method="xml")
161162

162163
Generates a string representation of an XML element, including all
163164
subelements. *element* is an :class:`Element` instance. *encoding* [1]_ is
164-
the output encoding (default is None). *method* is either ``"xml"``,
165+
the output encoding (default is US-ASCII). Use ``encoding="unicode"`` to
166+
generate a Unicode string. *method* is either ``"xml"``,
165167
``"html"`` or ``"text"`` (default is ``"xml"``). Returns a list of
166168
(optionally) encoded strings containing the XML data. It does not guarantee
167169
any specific sequence, except that ``"".join(tostringlist(element)) ==
@@ -430,6 +432,7 @@ ElementTree Objects
430432

431433

432434
.. method:: getroot()
435+
433436
Returns the root element for this tree.
434437

435438

@@ -457,15 +460,16 @@ ElementTree Objects
457460
root element.
458461

459462

460-
.. method:: write(file, encoding=None, xml_declaration=None, method="xml")
463+
.. method:: write(file, encoding="us-ascii", xml_declaration=None, method="xml")
461464

462465
Writes the element tree to a file, as XML. *file* is a file name, or a
463466
file object opened for writing. *encoding* [1]_ is the output encoding
464-
(default is None). *xml_declaration* controls if an XML declaration
467+
(default is US-ASCII). Use ``encoding="unicode"`` to write a Unicode string.
468+
*xml_declaration* controls if an XML declaration
465469
should be added to the file. Use False for never, True for always, None
466-
for only if not US-ASCII or UTF-8 (default is None). *method* is either
467-
``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``). Returns an
468-
(optionally) encoded string.
470+
for only if not US-ASCII, UTF-8 or Unicode (default is None). *method* is
471+
either ``"xml"``, ``"html"`` or ``"text"`` (default is ``"xml"``).
472+
Returns an (optionally) encoded string.
469473

470474
This is the XML file that is going to be manipulated::
471475

Lib/test/test_xml_etree.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,14 @@ def check_method(method):
7171
if not hasattr(method, '__call__'):
7272
print(method, "not callable")
7373

74-
def serialize(elem, to_string=True, **options):
74+
def serialize(elem, to_string=True, encoding='unicode', **options):
7575
import io
76-
if options.get("encoding"):
76+
if encoding != 'unicode':
7777
file = io.BytesIO()
7878
else:
7979
file = io.StringIO()
8080
tree = ET.ElementTree(elem)
81-
tree.write(file, **options)
81+
tree.write(file, encoding=encoding, **options)
8282
if to_string:
8383
return file.getvalue()
8484
else:
@@ -537,7 +537,7 @@ def attrib():
537537
>>> elem.set('testa', 'testval')
538538
>>> elem.set('testb', 'test2')
539539
>>> ET.tostring(elem)
540-
'<test testa="testval" testb="test2">aa</test>'
540+
b'<test testa="testval" testb="test2">aa</test>'
541541
>>> sorted(elem.keys())
542542
['testa', 'testb']
543543
>>> sorted(elem.items())
@@ -547,7 +547,7 @@ def attrib():
547547
>>> elem.attrib['testb'] = 'test1'
548548
>>> elem.attrib['testc'] = 'test2'
549549
>>> ET.tostring(elem)
550-
'<test testa="testval" testb="test1" testc="test2">aa</test>'
550+
b'<test testa="testval" testb="test1" testc="test2">aa</test>'
551551
"""
552552

553553
def makeelement():
@@ -587,15 +587,15 @@ def parsefile():
587587
588588
>>> tree = ET.parse(SIMPLE_XMLFILE)
589589
>>> normalize_crlf(tree)
590-
>>> tree.write(sys.stdout)
590+
>>> tree.write(sys.stdout, encoding='unicode')
591591
<root>
592592
<element key="value">text</element>
593593
<element>text</element>tail
594594
<empty-element />
595595
</root>
596596
>>> tree = ET.parse(SIMPLE_NS_XMLFILE)
597597
>>> normalize_crlf(tree)
598-
>>> tree.write(sys.stdout)
598+
>>> tree.write(sys.stdout, encoding='unicode')
599599
<ns0:root xmlns:ns0="namespace">
600600
<ns0:element key="value">text</ns0:element>
601601
<ns0:element>text</ns0:element>tail
@@ -636,17 +636,17 @@ def parsefile():
636636
def parseliteral():
637637
"""
638638
>>> element = ET.XML("<html><body>text</body></html>")
639-
>>> ET.ElementTree(element).write(sys.stdout)
639+
>>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
640640
<html><body>text</body></html>
641641
>>> element = ET.fromstring("<html><body>text</body></html>")
642-
>>> ET.ElementTree(element).write(sys.stdout)
642+
>>> ET.ElementTree(element).write(sys.stdout, encoding='unicode')
643643
<html><body>text</body></html>
644644
>>> sequence = ["<html><body>", "text</bo", "dy></html>"]
645645
>>> element = ET.fromstringlist(sequence)
646646
>>> print(ET.tostring(element))
647-
<html><body>text</body></html>
648-
>>> print("".join(ET.tostringlist(element)))
649-
<html><body>text</body></html>
647+
b'<html><body>text</body></html>'
648+
>>> print(b"".join(ET.tostringlist(element)))
649+
b'<html><body>text</body></html>'
650650
>>> ET.tostring(element, "ascii")
651651
b"<?xml version='1.0' encoding='ascii'?>\\n<html><body>text</body></html>"
652652
>>> _, ids = ET.XMLID("<html><body>text</body></html>")
@@ -875,10 +875,10 @@ def writestring():
875875
"""
876876
>>> elem = ET.XML("<html><body>text</body></html>")
877877
>>> ET.tostring(elem)
878-
'<html><body>text</body></html>'
878+
b'<html><body>text</body></html>'
879879
>>> elem = ET.fromstring("<html><body>text</body></html>")
880880
>>> ET.tostring(elem)
881-
'<html><body>text</body></html>'
881+
b'<html><body>text</body></html>'
882882
"""
883883

884884
def check_encoding(encoding):
@@ -1233,14 +1233,14 @@ def processinginstruction():
12331233
Test ProcessingInstruction directly
12341234
12351235
>>> ET.tostring(ET.ProcessingInstruction('test', 'instruction'))
1236-
'<?test instruction?>'
1236+
b'<?test instruction?>'
12371237
>>> ET.tostring(ET.PI('test', 'instruction'))
1238-
'<?test instruction?>'
1238+
b'<?test instruction?>'
12391239
12401240
Issue #2746
12411241
12421242
>>> ET.tostring(ET.PI('test', '<testing&>'))
1243-
'<?test <testing&>?>'
1243+
b'<?test <testing&>?>'
12441244
>>> ET.tostring(ET.PI('test', '<testing&>\xe3'), 'latin1')
12451245
b"<?xml version='1.0' encoding='latin1'?>\\n<?test <testing&>\\xe3?>"
12461246
"""
@@ -1643,11 +1643,11 @@ def bug_200708_newline():
16431643
16441644
>>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
16451645
>>> ET.tostring(e)
1646-
'<SomeTag text="def _f():&#10; return 3&#10;" />'
1646+
b'<SomeTag text="def _f():&#10; return 3&#10;" />'
16471647
>>> ET.XML(ET.tostring(e)).get("text")
16481648
'def _f():\n return 3\n'
16491649
>>> ET.tostring(ET.XML(ET.tostring(e)))
1650-
'<SomeTag text="def _f():&#10; return 3&#10;" />'
1650+
b'<SomeTag text="def _f():&#10; return 3&#10;" />'
16511651
16521652
"""
16531653

@@ -1698,15 +1698,15 @@ def bug_200709_register_namespace():
16981698
"""
16991699
17001700
>>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
1701-
'<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
1701+
b'<ns0:title xmlns:ns0="http://namespace.invalid/does/not/exist/" />'
17021702
>>> ET.register_namespace("foo", "http://namespace.invalid/does/not/exist/")
17031703
>>> ET.tostring(ET.Element("{http://namespace.invalid/does/not/exist/}title"))
1704-
'<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
1704+
b'<foo:title xmlns:foo="http://namespace.invalid/does/not/exist/" />'
17051705
17061706
And the Dublin Core namespace is in the default list:
17071707
17081708
>>> ET.tostring(ET.Element("{http://purl.org/dc/elements/1.1/}title"))
1709-
'<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
1709+
b'<dc:title xmlns:dc="http://purl.org/dc/elements/1.1/" />'
17101710
17111711
"""
17121712

@@ -1792,7 +1792,7 @@ def check_issue3151():
17921792
'{${stuff}}localname'
17931793
>>> t = ET.ElementTree(e)
17941794
>>> ET.tostring(e)
1795-
'<ns0:localname xmlns:ns0="${stuff}" />'
1795+
b'<ns0:localname xmlns:ns0="${stuff}" />'
17961796
17971797
"""
17981798

Lib/xml/etree/ElementTree.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -792,12 +792,13 @@ def iterfind(self, path, namespaces=None):
792792
# @def write(file, **options)
793793
# @param file A file name, or a file object opened for writing.
794794
# @param **options Options, given as keyword arguments.
795-
# @keyparam encoding Optional output encoding (default is None).
795+
# @keyparam encoding Optional output encoding (default is US-ASCII).
796+
# Use "unicode" to write a Unicode string.
796797
# @keyparam method Optional output method ("xml", "html", "text" or
797798
# "c14n"; default is "xml").
798799
# @keyparam xml_declaration Controls if an XML declaration should
799800
# be added to the file. Use False for never, True for always,
800-
# None for only if not US-ASCII or UTF-8. None is default.
801+
# None for only if not US-ASCII, UTF-8 or Unicode. None is default.
801802

802803
def write(self, file_or_filename,
803804
# keyword arguments
@@ -811,14 +812,23 @@ def write(self, file_or_filename,
811812
elif method not in _serialize:
812813
# FIXME: raise an ImportError for c14n if ElementC14N is missing?
813814
raise ValueError("unknown method %r" % method)
815+
if not encoding:
816+
if method == "c14n":
817+
encoding = "utf-8"
818+
else:
819+
encoding = "us-ascii"
820+
elif encoding == str: # lxml.etree compatibility.
821+
encoding = "unicode"
822+
else:
823+
encoding = encoding.lower()
814824
if hasattr(file_or_filename, "write"):
815825
file = file_or_filename
816826
else:
817-
if encoding:
827+
if encoding != "unicode":
818828
file = open(file_or_filename, "wb")
819829
else:
820830
file = open(file_or_filename, "w")
821-
if encoding:
831+
if encoding != "unicode":
822832
def write(text):
823833
try:
824834
return file.write(text.encode(encoding,
@@ -827,20 +837,15 @@ def write(text):
827837
_raise_serialization_error(text)
828838
else:
829839
write = file.write
830-
if not encoding:
831-
if method == "c14n":
832-
encoding = "utf-8"
833-
else:
834-
encoding = None
835-
elif xml_declaration or (xml_declaration is None and
836-
encoding not in ("utf-8", "us-ascii")):
837-
if method == "xml":
838-
encoding_ = encoding
839-
if not encoding:
840-
# Retrieve the default encoding for the xml declaration
841-
import locale
842-
encoding_ = locale.getpreferredencoding()
843-
write("<?xml version='1.0' encoding='%s'?>\n" % encoding_)
840+
if method == "xml" and (xml_declaration or
841+
(xml_declaration is None and
842+
encoding not in ("utf-8", "us-ascii", "unicode"))):
843+
declared_encoding = encoding
844+
if encoding == "unicode":
845+
# Retrieve the default encoding for the xml declaration
846+
import locale
847+
declared_encoding = locale.getpreferredencoding()
848+
write("<?xml version='1.0' encoding='%s'?>\n" % declared_encoding)
844849
if method == "text":
845850
_serialize_text(write, self._root)
846851
else:
@@ -1127,11 +1132,12 @@ def _escape_attrib_html(text):
11271132

11281133
##
11291134
# Generates a string representation of an XML element, including all
1130-
# subelements. If encoding is None, the return type is a string;
1135+
# subelements. If encoding is "unicode", the return type is a string;
11311136
# otherwise it is a bytes array.
11321137
#
11331138
# @param element An Element instance.
1134-
# @keyparam encoding Optional output encoding (default is None).
1139+
# @keyparam encoding Optional output encoding (default is US-ASCII).
1140+
# Use "unicode" to return a Unicode string.
11351141
# @keyparam method Optional output method ("xml", "html", "text" or
11361142
# "c14n"; default is "xml").
11371143
# @return An (optionally) encoded string containing the XML data.
@@ -1144,17 +1150,20 @@ class dummy:
11441150
file = dummy()
11451151
file.write = data.append
11461152
ElementTree(element).write(file, encoding, method=method)
1147-
if encoding:
1148-
return b"".join(data)
1149-
else:
1153+
if encoding in (str, "unicode"):
11501154
return "".join(data)
1155+
else:
1156+
return b"".join(data)
11511157

11521158
##
11531159
# Generates a string representation of an XML element, including all
1154-
# subelements. The string is returned as a sequence of string fragments.
1160+
# subelements. If encoding is "unicode", the string is returned as a
1161+
# sequence of string fragments; otherwise it is a sequence of
1162+
# bytestrings.
11551163
#
11561164
# @param element An Element instance.
11571165
# @keyparam encoding Optional output encoding (default is US-ASCII).
1166+
# Use "unicode" to return a Unicode string.
11581167
# @keyparam method Optional output method ("xml", "html", "text" or
11591168
# "c14n"; default is "xml").
11601169
# @return A sequence object containing the XML data.
@@ -1184,7 +1193,7 @@ def dump(elem):
11841193
# debugging
11851194
if not isinstance(elem, ElementTree):
11861195
elem = ElementTree(elem)
1187-
elem.write(sys.stdout)
1196+
elem.write(sys.stdout, encoding="unicode")
11881197
tail = elem.getroot().tail
11891198
if not tail or tail[-1] != "\n":
11901199
sys.stdout.write("\n")

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,9 @@ Extensions
5555
Library
5656
-------
5757

58+
- Issue #8047: Fix the xml.etree serializer to return bytes by default. Use
59+
``encoding="unicode"`` to generate a Unicode string.
60+
5861
- Fix Issue8280 - urllib2's Request method will remove fragments in the url.
5962
This is how it is supposed to work, wget and curl do the same. Previous
6063
behavior was wrong.

0 commit comments

Comments
 (0)