[Python-checkins] bpo-21315: Fix parsing of encoded words with missing leading ws. (GH-13425) (#13846)

Barry Warsaw webhook-mailer at python.org
Thu Jun 6 13:08:47 EDT 2019


https://github.com/python/cpython/commit/dc20fc4311dece19488299a7cd11317ffbe4d3c3
commit: dc20fc4311dece19488299a7cd11317ffbe4d3c3
branch: 3.7
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: Barry Warsaw <barry at python.org>
date: 2019-06-06T10:08:43-07:00
summary:

bpo-21315: Fix parsing of encoded words with missing leading ws. (GH-13425) (#13846)

* bpo-21315: Fix parsing of encoded words with missing leading ws.

Because of missing leading whitespace, encoded word would get parsed as
unstructured token. This patch fixes that by looking for encoded words when
splitting tokens with whitespace.

Missing trailing whitespace around encoded word now register a defect
instead.

Original patch suggestion by David R. Murray on bpo-21315.
(cherry picked from commit 66c4f3f38b867d8329b28c032bb907fd1a2f22d2)

Co-authored-by: Abhilash Raj <maxking at users.noreply.github.com>

files:
A Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
M Lib/email/_header_value_parser.py
M Lib/test/test_email/test__header_value_parser.py
M Lib/test/test_email/test_headerregistry.py

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 958ef5018c25..18aecbffa71a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -97,6 +97,18 @@
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
 
+# Match a RFC 2047 word, looks like =?utf-8?q?someword?=
+rfc2047_matcher = re.compile(r'''
+   =\?            # literal =?
+   [^?]*          # charset
+   \?             # literal ?
+   [qQbB]         # literal 'q' or 'b', case insensitive
+   \?             # literal ?
+  .*?             # encoded word
+  \?=             # literal ?=
+''', re.VERBOSE | re.MULTILINE)
+
+
 #
 # TokenList and its subclasses
 #
@@ -1050,6 +1062,10 @@ def get_encoded_word(value):
         _validate_xtext(vtext)
         ew.append(vtext)
         text = ''.join(remainder)
+    # Encoded words should be followed by a WS
+    if value and value[0] not in WSP:
+        ew.defects.append(errors.InvalidHeaderDefect(
+            "missing trailing whitespace after encoded-word"))
     return ew, value
 
 def get_unstructured(value):
@@ -1102,6 +1118,11 @@ def get_unstructured(value):
                 unstructured.append(token)
                 continue
         tok, *remainder = _wsp_splitter(value, 1)
+        # Split in the middle of an atom if there is a rfc2047 encoded word
+        # which does not have WSP on both sides. The defect will be registered
+        # the next time through the loop.
+        if rfc2047_matcher.search(tok):
+            tok, *remainder = value.partition('=?')
         vtext = ValueTerminal(tok, 'vtext')
         _validate_xtext(vtext)
         unstructured.append(vtext)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index 676732bb3d02..693487bc960f 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -118,7 +118,7 @@ def test_get_encoded_word_gets_first_even_if_no_space(self):
                          '=?us-ascii?q?first?==?utf-8?q?second?=',
                          'first',
                          'first',
-                         [],
+                         [errors.InvalidHeaderDefect],
                          '=?utf-8?q?second?=')
 
     def test_get_encoded_word_sets_extra_attributes(self):
@@ -361,6 +361,25 @@ def test_get_unstructured_no_whitespace_between_ews(self):
             '=?utf-8?q?foo?==?utf-8?q?bar?=',
             'foobar',
             'foobar',
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_leading_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            'nowhitespace=?utf-8?q?somevalue?=',
+            'nowhitespacesomevalue',
+            'nowhitespacesomevalue',
+            [errors.InvalidHeaderDefect],
+            '')
+
+    def test_get_unstructured_ew_without_trailing_whitespace(self):
+        self._test_get_x(
+            self._get_unst,
+            '=?utf-8?q?somevalue?=nowhitespace',
+            'somevaluenowhitespace',
+            'somevaluenowhitespace',
             [errors.InvalidHeaderDefect],
             '')
 
@@ -546,7 +565,8 @@ def test_encoded_word_inside_quotes(self):
             '"=?utf-8?Q?not_really_valid?="',
             '"not really valid"',
             'not really valid',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+             errors.InvalidHeaderDefect],
             '')
 
     # get_comment
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index d1007099f666..e6db3acedcc1 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -1180,7 +1180,8 @@ class TestAddressHeader(TestHeaderBase):
 
         'rfc2047_atom_in_quoted_string_is_decoded':
             ('"=?utf-8?q?=C3=89ric?=" <foo at example.com>',
-            [errors.InvalidHeaderDefect],
+            [errors.InvalidHeaderDefect,
+            errors.InvalidHeaderDefect],
             'Éric <foo at example.com>',
             'Éric',
             'foo at example.com',
diff --git a/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
new file mode 100644
index 000000000000..dd0dd7f72c0a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-05-19-10-48-46.bpo-21315.PgXVqF.rst
@@ -0,0 +1,4 @@
+Email headers containing RFC2047 encoded words are parsed despite the missing
+whitespace, and a defect registered. Also missing trailing whitespace after
+encoded words is now registered as a defect.
+



More information about the Python-checkins mailing list