[Python-checkins] cpython: #24211: Add RFC6532 support to the email library.

r.david.murray python-checkins at python.org
Sun May 17 17:29:50 CEST 2015


https://hg.python.org/cpython/rev/9f0d5e33230f
changeset:   96116:9f0d5e33230f
user:        R David Murray <rdmurray at bitdance.com>
date:        Sun May 17 11:29:21 2015 -0400
summary:
  #24211: Add RFC6532 support to the email library.

This could use more edge case tests, but the basic functionality is tested.
(Note that this changeset does not add tailored support for the RFC 6532
message/global MIME type, but the email package generic facilities will handle
it.)

Reviewed by Maciej Szulik.

files:
  Doc/library/email.policy.rst          |   8 +++++
  Doc/whatsnew/3.5.rst                  |   6 ++++
  Lib/email/_header_value_parser.py     |  11 ++++--
  Lib/email/policy.py                   |  15 +++++++++-
  Lib/test/test_email/test_generator.py |  22 +++++++++++++++
  Lib/test/test_email/test_policy.py    |   4 ++
  Misc/NEWS                             |   3 ++
  7 files changed, 64 insertions(+), 5 deletions(-)


diff --git a/Doc/library/email.policy.rst b/Doc/library/email.policy.rst
--- a/Doc/library/email.policy.rst
+++ b/Doc/library/email.policy.rst
@@ -378,6 +378,14 @@
    In addition to the settable attributes listed above that apply to all
    policies, this policy adds the following additional attributes:
 
+   .. attribute:: utf8
+
+      If ``False``, follow :rfc:`5322`, supporting non-ASCII characters in
+      headers by encoding them as "encoded words".  If ``True``, follow
+      :rfc:`6532` and use ``utf-8`` encoding for headers.  Messages
+      formatted in this way may be passed to SMTP servers that support
+      the ``SMTPUTF8`` extension (:rfc:`6531`).
+
    .. attribute:: refold_source
 
       If the value for a header in the ``Message`` object originated from a
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -356,6 +356,12 @@
   header (``None`` if there is no such header).  (Contributed by Abhilash Raj
   in :issue:`21083`.)
 
+* A new policy option :attr:`~email.policy.EmailPolicy.utf8` can be set
+  ``True`` to encode email headers using the utf8 charset instead of using
+  encoded words.  This allows ``Messages`` to be formatted according to
+  :rfc:`6532` and used with an SMTP server that supports the :rfc:`6531`
+  ``SMTPUTF8`` extension.  (Contributed by R. David Murray in :issue:`24211`.)
+
 glob
 ----
 
diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -320,17 +320,18 @@
         return ''.join(res)
 
     def _fold(self, folded):
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
                     charset = 'unknown-8bit'
                 else:
-                    # XXX: this should be a policy setting
+                    # XXX: this should be a policy setting when utf8 is False.
                     charset = 'utf-8'
                 tstr = part.cte_encode(charset, folded.policy)
                 tlen = len(tstr)
@@ -394,11 +395,12 @@
 
     def _fold(self, folded):
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             is_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                        for x in part.all_defects):
@@ -475,12 +477,13 @@
         # comment that becomes a barrier across which we can't compose encoded
         # words.
         last_ew = None
+        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
         for part in self.parts:
             tstr = str(part)
             tlen = len(tstr)
             has_ew = False
             try:
-                str(part).encode('us-ascii')
+                str(part).encode(encoding)
             except UnicodeEncodeError:
                 if any(isinstance(x, errors.UndecodableBytesDefect)
                         for x in part.all_defects):
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
--- a/Lib/email/policy.py
+++ b/Lib/email/policy.py
@@ -35,6 +35,13 @@
     In addition to the settable attributes listed above that apply to
     all Policies, this policy adds the following additional attributes:
 
+    utf8                -- if False (the default) message headers will be
+                           serialized as ASCII, using encoded words to encode
+                           any non-ASCII characters in the source strings.  If
+                           True, the message headers will be serialized using
+                           utf8 and will not contain encoded words (see RFC
+                           6532 for more on this serialization format).
+
     refold_source       -- if the value for a header in the Message object
                            came from the parsing of some source, this attribute
                            indicates whether or not a generator should refold
@@ -72,6 +79,7 @@
 
     """
 
+    utf8 = False
     refold_source = 'long'
     header_factory = HeaderRegistry()
     content_manager = raw_data_manager
@@ -175,9 +183,13 @@
         refold_header setting, since there is no way to know whether the binary
         data consists of single byte characters or multibyte characters.
 
+        If utf8 is true, headers are encoded to utf8, otherwise to ascii with
+        non-ASCII unicode rendered as encoded words.
+
         """
         folded = self._fold(name, value, refold_binary=self.cte_type=='7bit')
-        return folded.encode('ascii', 'surrogateescape')
+        charset = 'utf8' if self.utf8 else 'ascii'
+        return folded.encode(charset, 'surrogateescape')
 
     def _fold(self, name, value, refold_binary=False):
         if hasattr(value, 'name'):
@@ -199,3 +211,4 @@
 strict = default.clone(raise_on_defect=True)
 SMTP = default.clone(linesep='\r\n')
 HTTP = default.clone(linesep='\r\n', max_line_length=None)
+SMTPUTF8 = SMTP.clone(utf8=True)
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -2,6 +2,7 @@
 import textwrap
 import unittest
 from email import message_from_string, message_from_bytes
+from email.message import EmailMessage
 from email.generator import Generator, BytesGenerator
 from email import policy
 from test.test_email import TestEmailBase, parameterize
@@ -194,6 +195,27 @@
         g.flatten(msg)
         self.assertEqual(s.getvalue(), expected)
 
+    def test_smtputf8_policy(self):
+        msg = EmailMessage()
+        msg['From'] = "Páolo <főo at bar.com>"
+        msg['To'] = 'Dinsdale'
+        msg['Subject'] = 'Nudge nudge, wink, wink \u1F609'
+        msg.set_content("oh là là, know what I mean, know what I mean?")
+        expected = textwrap.dedent("""\
+            From: Páolo <főo at bar.com>
+            To: Dinsdale
+            Subject: Nudge nudge, wink, wink \u1F609
+            Content-Type: text/plain; charset="utf-8"
+            Content-Transfer-Encoding: 8bit
+            MIME-Version: 1.0
+
+            oh là là, know what I mean, know what I mean?
+            """).encode('utf-8').replace(b'\n', b'\r\n')
+        s = io.BytesIO()
+        g = BytesGenerator(s, policy=policy.SMTPUTF8)
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), expected)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/Lib/test/test_email/test_policy.py b/Lib/test/test_email/test_policy.py
--- a/Lib/test/test_email/test_policy.py
+++ b/Lib/test/test_email/test_policy.py
@@ -27,6 +27,7 @@
     # If any of these defaults change, the docs must be updated.
     policy_defaults = compat32_defaults.copy()
     policy_defaults.update({
+        'utf8':                     False,
         'raise_on_defect':          False,
         'header_factory':           email.policy.EmailPolicy.header_factory,
         'refold_source':            'long',
@@ -42,6 +43,9 @@
         email.policy.default: make_defaults(policy_defaults, {}),
         email.policy.SMTP: make_defaults(policy_defaults,
                                          {'linesep': '\r\n'}),
+        email.policy.SMTPUTF8: make_defaults(policy_defaults,
+                                             {'linesep': '\r\n',
+                                              'utf8': True}),
         email.policy.HTTP: make_defaults(policy_defaults,
                                          {'linesep': '\r\n',
                                           'max_line_length': None}),
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@
 Library
 -------
 
+- Issue #24211: The email library now supports RFC 6532: it can generate
+  headers using utf-8 instead of encoded words.
+
 - Issue #16314: Added support for the LZMA compression in distutils.
 
 - Issue #21804: poplib now supports RFC 6856 (UTF8).

-- 
Repository URL: https://hg.python.org/cpython


More information about the Python-checkins mailing list