[Python-checkins] [3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)
miss-islington
webhook-mailer at python.org
Wed May 11 13:40:16 EDT 2022
https://github.com/python/cpython/commit/bfc88d3418af6f4ef16aa306f12dd2d36ef957ae
commit: bfc88d3418af6f4ef16aa306f12dd2d36ef957ae
branch: 3.9
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: miss-islington <31488909+miss-islington at users.noreply.github.com>
date: 2022-05-11T10:40:05-07:00
summary:
[3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)
The ElementTree method write() and the function tostring() now use the text
file's encoding ("UTF-8" if not available) instead of the locale encoding in
the XML declaration when encoding="unicode" is specified.
(cherry picked from commit 707839b0fe02ba2c891a40f40e7a869d84c2c9c5)
Co-authored-by: Serhiy Storchaka <storchaka at gmail.com>
Automerge-Triggered-By: GH:serhiy-storchaka
files:
A Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
M Lib/test/test_xml_etree.py
M Lib/xml/etree/ElementTree.py
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 22f14a2b4d633..956d4c587c36e 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -10,7 +10,6 @@
import html
import io
import itertools
-import locale
import operator
import os
import pickle
@@ -960,15 +959,13 @@ def test_tostring_xml_declaration(self):
def test_tostring_xml_declaration_unicode_encoding(self):
elem = ET.XML('<body><tag/></body>')
- preferredencoding = locale.getpreferredencoding()
self.assertEqual(
- f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
- ET.tostring(elem, encoding='unicode', xml_declaration=True)
+ ET.tostring(elem, encoding='unicode', xml_declaration=True),
+ "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
)
def test_tostring_xml_declaration_cases(self):
elem = ET.XML('<body><tag>ø</tag></body>')
- preferredencoding = locale.getpreferredencoding()
TESTCASES = [
# (expected_retval, encoding, xml_declaration)
# ... xml_declaration = None
@@ -995,7 +992,7 @@ def test_tostring_xml_declaration_cases(self):
b"<body><tag>ø</tag></body>", 'US-ASCII', True),
(b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
- (f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
+ ("<?xml version='1.0' encoding='utf-8'?>\n"
"<body><tag>ø</tag></body>", 'unicode', True),
]
@@ -1033,11 +1030,10 @@ def test_tostringlist_xml_declaration(self):
b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
)
- preferredencoding = locale.getpreferredencoding()
stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
self.assertEqual(
''.join(stringlist),
- f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
+ "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
)
self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
@@ -3681,17 +3677,16 @@ def test_write_to_filename_as_unicode(self):
encoding = f.encoding
support.unlink(TESTFN)
- try:
- '\xf8'.encode(encoding)
- except UnicodeEncodeError:
- self.skipTest(f'default file encoding {encoding} not supported')
-
tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
tree.write(TESTFN, encoding='unicode')
with open(TESTFN, 'rb') as f:
data = f.read()
expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
- self.assertEqual(data, expected)
+ if encoding.lower() in ('utf-8', 'ascii'):
+ self.assertEqual(data, expected)
+ else:
+ self.assertIn(b"<?xml version='1.0' encoding=", data)
+ self.assertIn(expected, data)
def test_write_to_text_file(self):
self.addCleanup(support.unlink, TESTFN)
@@ -3706,13 +3701,17 @@ def test_write_to_text_file(self):
tree.write(f, encoding='unicode')
self.assertFalse(f.closed)
with open(TESTFN, 'rb') as f:
- self.assertEqual(f.read(), b'''<site>ø</site>''')
+ self.assertEqual(f.read(), convlinesep(
+ b'''<?xml version='1.0' encoding='ascii'?>\n'''
+ b'''<site>ø</site>'''))
with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
tree.write(f, encoding='unicode')
self.assertFalse(f.closed)
with open(TESTFN, 'rb') as f:
- self.assertEqual(f.read(), b'''<site>\xf8</site>''')
+ self.assertEqual(f.read(), convlinesep(
+ b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
+ b'''<site>\xf8</site>'''))
def test_write_to_binary_file(self):
self.addCleanup(support.unlink, TESTFN)
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index dae2251d859da..66c43c2d0d32b 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -728,16 +728,10 @@ def write(self, file_or_filename,
encoding = "utf-8"
else:
encoding = "us-ascii"
- enc_lower = encoding.lower()
- with _get_writer(file_or_filename, enc_lower) as write:
+ with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
if method == "xml" and (xml_declaration or
(xml_declaration is None and
- enc_lower not in ("utf-8", "us-ascii", "unicode"))):
- declared_encoding = encoding
- if enc_lower == "unicode":
- # Retrieve the default encoding for the xml declaration
- import locale
- declared_encoding = locale.getpreferredencoding()
+ declared_encoding.lower() not in ("utf-8", "us-ascii"))):
write("<?xml version='1.0' encoding='%s'?>\n" % (
declared_encoding,))
if method == "text":
@@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
write = file_or_filename.write
except AttributeError:
# file_or_filename is a file name
- if encoding == "unicode":
- file = open(file_or_filename, "w")
+ if encoding.lower() == "unicode":
+ file = open(file_or_filename, "w",
+ errors="xmlcharrefreplace")
else:
file = open(file_or_filename, "w", encoding=encoding,
errors="xmlcharrefreplace")
with file:
- yield file.write
+ yield file.write, file.encoding
else:
# file_or_filename is a file-like object
# encoding determines if it is a text or binary writer
- if encoding == "unicode":
+ if encoding.lower() == "unicode":
# use a text writer as is
- yield write
+ yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
else:
# wrap a binary writer with TextIOWrapper
with contextlib.ExitStack() as stack:
@@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
# Keep the original file open when the TextIOWrapper is
# destroyed
stack.callback(file.detach)
- yield file.write
+ yield file.write, encoding
def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree
diff --git a/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
new file mode 100644
index 0000000000000..0711f8466b818
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
@@ -0,0 +1,5 @@
+:class:`~xml.etree.ElementTree.ElementTree` method
+:meth:`~xml.etree.ElementTree.ElementTree.write` and function
+:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
+("UTF-8" if not available) instead of locale encoding in XML declaration
+when ``encoding="unicode"`` is specified.
More information about the Python-checkins
mailing list