[Python-checkins] [3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)

miss-islington webhook-mailer at python.org
Wed May 11 13:40:16 EDT 2022


https://github.com/python/cpython/commit/bfc88d3418af6f4ef16aa306f12dd2d36ef957ae
commit: bfc88d3418af6f4ef16aa306f12dd2d36ef957ae
branch: 3.9
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: miss-islington <31488909+miss-islington at users.noreply.github.com>
date: 2022-05-11T10:40:05-07:00
summary:

[3.9] gh-91810: ElementTree: Use text file's encoding by default in XML declaration (GH-91903) (GH-92665)



ElementTree method write() and function tostring() now use the text file's
encoding ("UTF-8" if not available) instead of locale encoding in XML
declaration when encoding="unicode" is specified.
(cherry picked from commit 707839b0fe02ba2c891a40f40e7a869d84c2c9c5)


Co-authored-by: Serhiy Storchaka <storchaka at gmail.com>

Automerge-Triggered-By: GH:serhiy-storchaka

files:
A Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
M Lib/test/test_xml_etree.py
M Lib/xml/etree/ElementTree.py

diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
index 22f14a2b4d633..956d4c587c36e 100644
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -10,7 +10,6 @@
 import html
 import io
 import itertools
-import locale
 import operator
 import os
 import pickle
@@ -960,15 +959,13 @@ def test_tostring_xml_declaration(self):
 
     def test_tostring_xml_declaration_unicode_encoding(self):
         elem = ET.XML('<body><tag/></body>')
-        preferredencoding = locale.getpreferredencoding()
         self.assertEqual(
-            f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>",
-            ET.tostring(elem, encoding='unicode', xml_declaration=True)
+            ET.tostring(elem, encoding='unicode', xml_declaration=True),
+            "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
         )
 
     def test_tostring_xml_declaration_cases(self):
         elem = ET.XML('<body><tag>ø</tag></body>')
-        preferredencoding = locale.getpreferredencoding()
         TESTCASES = [
         #   (expected_retval,                  encoding, xml_declaration)
             # ... xml_declaration = None
@@ -995,7 +992,7 @@ def test_tostring_xml_declaration_cases(self):
              b"<body><tag>ø</tag></body>", 'US-ASCII', True),
             (b"<?xml version='1.0' encoding='ISO-8859-1'?>\n"
              b"<body><tag>\xf8</tag></body>", 'ISO-8859-1', True),
-            (f"<?xml version='1.0' encoding='{preferredencoding}'?>\n"
+            ("<?xml version='1.0' encoding='utf-8'?>\n"
              "<body><tag>ø</tag></body>", 'unicode', True),
 
         ]
@@ -1033,11 +1030,10 @@ def test_tostringlist_xml_declaration(self):
             b"<?xml version='1.0' encoding='us-ascii'?>\n<body><tag /></body>"
         )
 
-        preferredencoding = locale.getpreferredencoding()
         stringlist = ET.tostringlist(elem, encoding='unicode', xml_declaration=True)
         self.assertEqual(
             ''.join(stringlist),
-            f"<?xml version='1.0' encoding='{preferredencoding}'?>\n<body><tag /></body>"
+            "<?xml version='1.0' encoding='utf-8'?>\n<body><tag /></body>"
         )
         self.assertRegex(stringlist[0], r"^<\?xml version='1.0' encoding='.+'?>")
         self.assertEqual(['<body', '>', '<tag', ' />', '</body>'], stringlist[1:])
@@ -3681,17 +3677,16 @@ def test_write_to_filename_as_unicode(self):
             encoding = f.encoding
         support.unlink(TESTFN)
 
-        try:
-            '\xf8'.encode(encoding)
-        except UnicodeEncodeError:
-            self.skipTest(f'default file encoding {encoding} not supported')
-
         tree = ET.ElementTree(ET.XML('''<site>\xf8</site>'''))
         tree.write(TESTFN, encoding='unicode')
         with open(TESTFN, 'rb') as f:
             data = f.read()
             expected = "<site>\xf8</site>".encode(encoding, 'xmlcharrefreplace')
-            self.assertEqual(data, expected)
+            if encoding.lower() in ('utf-8', 'ascii'):
+                self.assertEqual(data, expected)
+            else:
+                self.assertIn(b"<?xml version='1.0' encoding=", data)
+                self.assertIn(expected, data)
 
     def test_write_to_text_file(self):
         self.addCleanup(support.unlink, TESTFN)
@@ -3706,13 +3701,17 @@ def test_write_to_text_file(self):
             tree.write(f, encoding='unicode')
             self.assertFalse(f.closed)
         with open(TESTFN, 'rb') as f:
-            self.assertEqual(f.read(), b'''<site>ø</site>''')
+            self.assertEqual(f.read(),  convlinesep(
+                             b'''<?xml version='1.0' encoding='ascii'?>\n'''
+                             b'''<site>ø</site>'''))
 
         with open(TESTFN, 'w', encoding='ISO-8859-1') as f:
             tree.write(f, encoding='unicode')
             self.assertFalse(f.closed)
         with open(TESTFN, 'rb') as f:
-            self.assertEqual(f.read(), b'''<site>\xf8</site>''')
+            self.assertEqual(f.read(),  convlinesep(
+                             b'''<?xml version='1.0' encoding='ISO-8859-1'?>\n'''
+                             b'''<site>\xf8</site>'''))
 
     def test_write_to_binary_file(self):
         self.addCleanup(support.unlink, TESTFN)
diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py
index dae2251d859da..66c43c2d0d32b 100644
--- a/Lib/xml/etree/ElementTree.py
+++ b/Lib/xml/etree/ElementTree.py
@@ -728,16 +728,10 @@ def write(self, file_or_filename,
                 encoding = "utf-8"
             else:
                 encoding = "us-ascii"
-        enc_lower = encoding.lower()
-        with _get_writer(file_or_filename, enc_lower) as write:
+        with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
             if method == "xml" and (xml_declaration or
                     (xml_declaration is None and
-                     enc_lower not in ("utf-8", "us-ascii", "unicode"))):
-                declared_encoding = encoding
-                if enc_lower == "unicode":
-                    # Retrieve the default encoding for the xml declaration
-                    import locale
-                    declared_encoding = locale.getpreferredencoding()
+                     declared_encoding.lower() not in ("utf-8", "us-ascii"))):
                 write("<?xml version='1.0' encoding='%s'?>\n" % (
                     declared_encoding,))
             if method == "text":
@@ -762,19 +756,20 @@ def _get_writer(file_or_filename, encoding):
         write = file_or_filename.write
     except AttributeError:
         # file_or_filename is a file name
-        if encoding == "unicode":
-            file = open(file_or_filename, "w")
+        if encoding.lower() == "unicode":
+            file = open(file_or_filename, "w",
+                        errors="xmlcharrefreplace")
         else:
             file = open(file_or_filename, "w", encoding=encoding,
                         errors="xmlcharrefreplace")
         with file:
-            yield file.write
+            yield file.write, file.encoding
     else:
         # file_or_filename is a file-like object
         # encoding determines if it is a text or binary writer
-        if encoding == "unicode":
+        if encoding.lower() == "unicode":
             # use a text writer as is
-            yield write
+            yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
         else:
             # wrap a binary writer with TextIOWrapper
             with contextlib.ExitStack() as stack:
@@ -805,7 +800,7 @@ def _get_writer(file_or_filename, encoding):
                 # Keep the original file open when the TextIOWrapper is
                 # destroyed
                 stack.callback(file.detach)
-                yield file.write
+                yield file.write, encoding
 
 def _namespaces(elem, default_namespace=None):
     # identify namespaces used in this tree
diff --git a/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
new file mode 100644
index 0000000000000..0711f8466b818
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-04-25-10-23-01.gh-issue-91810.DOHa6B.rst
@@ -0,0 +1,5 @@
+:class:`~xml.etree.ElementTree.ElementTree` method
+:meth:`~xml.etree.ElementTree.ElementTree.write` and function
+:func:`~xml.etree.ElementTree.tostring` now use the text file's encoding
+("UTF-8" if not available) instead of locale encoding in XML declaration
+when ``encoding="unicode"`` is specified.



More information about the Python-checkins mailing list