[Python-checkins] bpo-27240 Rewrite the email header folding algorithm. (GH-3488) (#4693)

R. David Murray webhook-mailer at python.org
Sun Dec 3 19:46:26 EST 2017

commit: a87ba60fe56ae2ebe80ab9ada6d280a6a1f3d552
branch: 3.6
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: R. David Murray <rdmurray at bitdance.com>
date: 2017-12-03T19:46:23-05:00

bpo-27240 Rewrite the email header folding algorithm. (GH-3488) (#4693)

The original algorithm tried to delegate the folding to the tokens so
that those tokens whose folding rules differed could specify the
differences.  However, this resulted in a lot of duplicated code because
most of the rules were the same.

The new algorithm moves all folding logic into a set of functions
external to the token classes, but puts the information about which
tokens can be folded in which ways on the tokens...with the exception of
mime-parameters, which are a special case (which was not even
implemented in the old folder).

This algorithm can probably still be improved and, hopefully, simplified.

Note that some of the test expectations are changed.  I believe the
changes are toward more desirable and consistent behavior: in general
when (re)folding a line, the canonical version of the tokens is
generated, rather than preserving errors or extra whitespace.
(cherry picked from commit 85d5c18c9d83a1d54eecc4c2ad4dce63194107c6)

A Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
M Lib/email/_header_value_parser.py
M Lib/email/headerregistry.py
M Lib/test/test_email/test__header_value_parser.py
M Lib/test/test_email/test_generator.py
M Lib/test/test_email/test_headerregistry.py

diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py
index 9b9697f7734..3ebbbe5383a 100644
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@@ -96,90 +96,6 @@
 def quote_string(value):
     return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
-# Accumulator for header folding
-class _Folded:
-    def __init__(self, maxlen, policy):
-        self.maxlen = maxlen
-        self.policy = policy
-        self.lastlen = 0
-        self.stickyspace = None
-        self.firstline = True
-        self.done = []
-        self.current = []
-    def newline(self):
-        self.done.extend(self.current)
-        self.done.append(self.policy.linesep)
-        self.current.clear()
-        self.lastlen = 0
-    def finalize(self):
-        if self.current:
-            self.newline()
-    def __str__(self):
-        return ''.join(self.done)
-    def append(self, stoken):
-        self.current.append(stoken)
-    def append_if_fits(self, token, stoken=None):
-        if stoken is None:
-            stoken = str(token)
-        l = len(stoken)
-        if self.stickyspace is not None:
-            stickyspace_len = len(self.stickyspace)
-            if self.lastlen + stickyspace_len + l <= self.maxlen:
-                self.current.append(self.stickyspace)
-                self.lastlen += stickyspace_len
-                self.current.append(stoken)
-                self.lastlen += l
-                self.stickyspace = None
-                self.firstline = False
-                return True
-            if token.has_fws:
-                ws = token.pop_leading_fws()
-                if ws is not None:
-                    self.stickyspace += str(ws)
-                    stickyspace_len += len(ws)
-                token._fold(self)
-                return True
-            if stickyspace_len and l + 1 <= self.maxlen:
-                margin = self.maxlen - l
-                if 0 < margin < stickyspace_len:
-                    trim = stickyspace_len - margin
-                    self.current.append(self.stickyspace[:trim])
-                    self.stickyspace = self.stickyspace[trim:]
-                    stickyspace_len = trim
-                self.newline()
-                self.current.append(self.stickyspace)
-                self.current.append(stoken)
-                self.lastlen = l + stickyspace_len
-                self.stickyspace = None
-                self.firstline = False
-                return True
-            if not self.firstline:
-                self.newline()
-            self.current.append(self.stickyspace)
-            self.current.append(stoken)
-            self.stickyspace = None
-            self.firstline = False
-            return True
-        if self.lastlen + l <= self.maxlen:
-            self.current.append(stoken)
-            self.lastlen += l
-            return True
-        if l < self.maxlen:
-            self.newline()
-            self.current.append(stoken)
-            self.lastlen = l
-            return True
-        return False
 # TokenList and its subclasses
@@ -187,6 +103,8 @@ def append_if_fits(self, token, stoken=None):
 class TokenList(list):
     token_type = None
+    syntactic_break = True
+    ew_combine_allowed = True
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
@@ -207,84 +125,13 @@ def value(self):
     def all_defects(self):
         return sum((x.all_defects for x in self), self.defects)
-    #
-    # Folding API
-    #
-    # parts():
-    #
-    # return a list of objects that constitute the "higher level syntactic
-    # objects" specified by the RFC as the best places to fold a header line.
-    # The returned objects must include leading folding white space, even if
-    # this means mutating the underlying parse tree of the object.  Each object
-    # is only responsible for returning *its* parts, and should not drill down
-    # to any lower level except as required to meet the leading folding white
-    # space constraint.
-    #
-    # _fold(folded):
-    #
-    #   folded: the result accumulator.  This is an instance of _Folded.
-    #       (XXX: I haven't finished factoring this out yet, the folding code
-    #       pretty much uses this as a state object.) When the folded.current
-    #       contains as much text as will fit, the _fold method should call
-    #       folded.newline.
-    #  folded.lastlen: the current length of the test stored in folded.current.
-    #  folded.maxlen: The maximum number of characters that may appear on a
-    #       folded line.  Differs from the policy setting in that "no limit" is
-    #       represented by +inf, which means it can be used in the trivially
-    #       logical fashion in comparisons.
-    #
-    # Currently no subclasses implement parts, and I think this will remain
-    # true.  A subclass only needs to implement _fold when the generic version
-    # isn't sufficient.  _fold will need to be implemented primarily when it is
-    # possible for encoded words to appear in the specialized token-list, since
-    # there is no generic algorithm that can know where exactly the encoded
-    # words are allowed.  A _fold implementation is responsible for filling
-    # lines in the same general way that the top level _fold does. It may, and
-    # should, call the _fold method of sub-objects in a similar fashion to that
-    # of the top level _fold.
-    #
-    # XXX: I'm hoping it will be possible to factor the existing code further
-    # to reduce redundancy and make the logic clearer.
-    @property
-    def parts(self):
-        klass = self.__class__
-        this = []
-        for token in self:
-            if token.startswith_fws():
-                if this:
-                    yield this[0] if len(this)==1 else klass(this)
-                    this.clear()
-            end_ws = token.pop_trailing_ws()
-            this.append(token)
-            if end_ws:
-                yield klass(this)
-                this = [end_ws]
-        if this:
-            yield this[0] if len(this)==1 else klass(this)
     def startswith_fws(self):
         return self[0].startswith_fws()
-    def pop_leading_fws(self):
-        if self[0].token_type == 'fws':
-            return self.pop(0)
-        return self[0].pop_leading_fws()
-    def pop_trailing_ws(self):
-        if self[-1].token_type == 'cfws':
-            return self.pop(-1)
-        return self[-1].pop_trailing_ws()
-    def has_fws(self):
-        for part in self:
-            if part.has_fws:
-                return True
-        return False
-    def has_leading_comment(self):
-        return self[0].has_leading_comment()
+    def as_ew_allowed(self):
+        """True if all top level tokens of this part may be RFC2047 encoded."""
+        return all(part.as_ew_allowed for part in self)
     def comments(self):
@@ -294,69 +141,13 @@ def comments(self):
         return comments
     def fold(self, *, policy):
-        # max_line_length 0/None means no limit, ie: infinitely long.
-        maxlen = policy.max_line_length or float("+inf")
-        folded = _Folded(maxlen, policy)
-        self._fold(folded)
-        folded.finalize()
-        return str(folded)
-    def as_encoded_word(self, charset):
-        # This works only for things returned by 'parts', which include
-        # the leading fws, if any, that should be used.
-        res = []
-        ws = self.pop_leading_fws()
-        if ws:
-            res.append(ws)
-        trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
-        res.append(_ew.encode(str(self), charset))
-        res.append(trailer)
-        return ''.join(res)
-    def cte_encode(self, charset, policy):
-        res = []
-        for part in self:
-            res.append(part.cte_encode(charset, policy))
-        return ''.join(res)
-    def _fold(self, folded):
-        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
-        for part in self.parts:
-            tstr = str(part)
-            tlen = len(tstr)
-            try:
-                str(part).encode(encoding)
-            except UnicodeEncodeError:
-                if any(isinstance(x, errors.UndecodableBytesDefect)
-                        for x in part.all_defects):
-                    charset = 'unknown-8bit'
-                else:
-                    # XXX: this should be a policy setting when utf8 is False.
-                    charset = 'utf-8'
-                tstr = part.cte_encode(charset, folded.policy)
-                tlen = len(tstr)
-            if folded.append_if_fits(part, tstr):
-                continue
-            # Peel off the leading whitespace if any and make it sticky, to
-            # avoid infinite recursion.
-            ws = part.pop_leading_fws()
-            if ws is not None:
-                folded.stickyspace = str(ws)
-                if folded.append_if_fits(part):
-                    continue
-            if part.has_fws:
-                part._fold(folded)
-                continue
-            # There are no fold points in this one; it is too long for a single
-            # line and can't be split...we just have to put it on its own line.
-            folded.append(tstr)
-            folded.newline()
+        return _refold_parse_tree(self, policy=policy)
     def pprint(self, indent=''):
-        print('\n'.join(self._pp(indent='')))
+        print(self.ppstr(indent=indent))
     def ppstr(self, indent=''):
-        return '\n'.join(self._pp(indent=''))
+        return '\n'.join(self._pp(indent=indent))
     def _pp(self, indent=''):
         yield '{}{}/{}('.format(
@@ -391,173 +182,11 @@ class UnstructuredTokenList(TokenList):
     token_type = 'unstructured'
-    def _fold(self, folded):
-        last_ew = None
-        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
-        for part in self.parts:
-            tstr = str(part)
-            is_ew = False
-            try:
-                str(part).encode(encoding)
-            except UnicodeEncodeError:
-                if any(isinstance(x, errors.UndecodableBytesDefect)
-                       for x in part.all_defects):
-                    charset = 'unknown-8bit'
-                else:
-                    charset = 'utf-8'
-                if last_ew is not None:
-                    # We've already done an EW, combine this one with it
-                    # if there's room.
-                    chunk = get_unstructured(
-                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
-                    oldlastlen = sum(len(x) for x in folded.current[:last_ew])
-                    schunk = str(chunk)
-                    lchunk = len(schunk)
-                    if oldlastlen + lchunk <= folded.maxlen:
-                        del folded.current[last_ew:]
-                        folded.append(schunk)
-                        folded.lastlen = oldlastlen + lchunk
-                        continue
-                tstr = part.as_encoded_word(charset)
-                is_ew = True
-            if folded.append_if_fits(part, tstr):
-                if is_ew:
-                    last_ew = len(folded.current) - 1
-                continue
-            if is_ew or last_ew:
-                # It's too big to fit on the line, but since we've
-                # got encoded words we can use encoded word folding.
-                part._fold_as_ew(folded)
-                continue
-            # Peel off the leading whitespace if any and make it sticky, to
-            # avoid infinite recursion.
-            ws = part.pop_leading_fws()
-            if ws is not None:
-                folded.stickyspace = str(ws)
-                if folded.append_if_fits(part):
-                    continue
-            if part.has_fws:
-                part._fold(folded)
-                continue
-            # It can't be split...we just have to put it on its own line.
-            folded.append(tstr)
-            folded.newline()
-            last_ew = None
-    def cte_encode(self, charset, policy):
-        res = []
-        last_ew = None
-        for part in self:
-            spart = str(part)
-            try:
-                spart.encode('us-ascii')
-                res.append(spart)
-            except UnicodeEncodeError:
-                if last_ew is None:
-                    res.append(part.cte_encode(charset, policy))
-                    last_ew = len(res)
-                else:
-                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
-                    res.append(tl.as_encoded_word(charset))
-        return ''.join(res)
 class Phrase(TokenList):
     token_type = 'phrase'
-    def _fold(self, folded):
-        # As with Unstructured, we can have pure ASCII with or without
-        # surrogateescape encoded bytes, or we could have unicode.  But this
-        # case is more complicated, since we have to deal with the various
-        # sub-token types and how they can be composed in the face of
-        # unicode-that-needs-CTE-encoding, and the fact that if a token a
-        # comment that becomes a barrier across which we can't compose encoded
-        # words.
-        last_ew = None
-        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
-        for part in self.parts:
-            tstr = str(part)
-            tlen = len(tstr)
-            has_ew = False
-            try:
-                str(part).encode(encoding)
-            except UnicodeEncodeError:
-                if any(isinstance(x, errors.UndecodableBytesDefect)
-                        for x in part.all_defects):
-                    charset = 'unknown-8bit'
-                else:
-                    charset = 'utf-8'
-                if last_ew is not None and not part.has_leading_comment():
-                    # We've already done an EW, let's see if we can combine
-                    # this one with it.  The last_ew logic ensures that all we
-                    # have at this point is atoms, no comments or quoted
-                    # strings.  So we can treat the text between the last
-                    # encoded word and the content of this token as
-                    # unstructured text, and things will work correctly.  But
-                    # we have to strip off any trailing comment on this token
-                    # first, and if it is a quoted string we have to pull out
-                    # the content (we're encoding it, so it no longer needs to
-                    # be quoted).
-                    if part[-1].token_type == 'cfws' and part.comments:
-                        remainder = part.pop(-1)
-                    else:
-                        remainder = ''
-                    for i, token in enumerate(part):
-                        if token.token_type == 'bare-quoted-string':
-                            part[i] = UnstructuredTokenList(token[:])
-                    chunk = get_unstructured(
-                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
-                    schunk = str(chunk)
-                    lchunk = len(schunk)
-                    if last_ew + lchunk <= folded.maxlen:
-                        del folded.current[last_ew:]
-                        folded.append(schunk)
-                        folded.lastlen = sum(len(x) for x in folded.current)
-                        continue
-                tstr = part.as_encoded_word(charset)
-                tlen = len(tstr)
-                has_ew = True
-            if folded.append_if_fits(part, tstr):
-                if has_ew and not part.comments:
-                    last_ew = len(folded.current) - 1
-                elif part.comments or part.token_type == 'quoted-string':
-                    # If a comment is involved we can't combine EWs.  And if a
-                    # quoted string is involved, it's not worth the effort to
-                    # try to combine them.
-                    last_ew = None
-                continue
-            part._fold(folded)
-    def cte_encode(self, charset, policy):
-        res = []
-        last_ew = None
-        is_ew = False
-        for part in self:
-            spart = str(part)
-            try:
-                spart.encode('us-ascii')
-                res.append(spart)
-            except UnicodeEncodeError:
-                is_ew = True
-                if last_ew is None:
-                    if not part.comments:
-                        last_ew = len(res)
-                    res.append(part.cte_encode(charset, policy))
-                elif not part.has_leading_comment():
-                    if part[-1].token_type == 'cfws' and part.comments:
-                        remainder = part.pop(-1)
-                    else:
-                        remainder = ''
-                    for i, token in enumerate(part):
-                        if token.token_type == 'bare-quoted-string':
-                            part[i] = UnstructuredTokenList(token[:])
-                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
-                    res[last_ew:] = [tl.as_encoded_word(charset)]
-            if part.comments or (not is_ew and part.token_type == 'quoted-string'):
-                last_ew = None
-        return ''.join(res)
 class Word(TokenList):
     token_type = 'word'
@@ -567,9 +196,6 @@ class CFWSList(WhiteSpaceTokenList):
     token_type = 'cfws'
-    def has_leading_comment(self):
-        return bool(self.comments)
 class Atom(TokenList):
@@ -579,6 +205,7 @@ class Atom(TokenList):
 class Token(TokenList):
     token_type = 'token'
+    encode_as_ew = False
 class EncodedWord(TokenList):
@@ -588,13 +215,6 @@ class EncodedWord(TokenList):
     charset = None
     lang = None
-    @property
-    def encoded(self):
-        if self.cte is not None:
-            return self.cte
-        _ew.encode(str(self), self.charset)
 class QuotedString(TokenList):
@@ -865,6 +485,7 @@ def display_name(self):
 class Domain(TokenList):
     token_type = 'domain'
+    as_ew_allowed = False
     def domain(self):
@@ -879,11 +500,13 @@ class DotAtom(TokenList):
 class DotAtomText(TokenList):
     token_type = 'dot-atom-text'
+    as_ew_allowed = True
 class AddrSpec(TokenList):
     token_type = 'addr-spec'
+    as_ew_allowed = False
     def local_part(self):
@@ -916,11 +539,13 @@ def addr_spec(self):
 class ObsLocalPart(TokenList):
     token_type = 'obs-local-part'
+    as_ew_allowed = False
 class DisplayName(Phrase):
     token_type = 'display-name'
+    ew_combine_allowed = False
     def display_name(self):
@@ -960,6 +585,7 @@ def value(self):
 class LocalPart(TokenList):
     token_type = 'local-part'
+    as_ew_allowed = False
     def value(self):
@@ -995,6 +621,7 @@ def local_part(self):
 class DomainLiteral(TokenList):
     token_type = 'domain-literal'
+    as_ew_allowed = False
     def domain(self):
@@ -1081,6 +708,7 @@ def stripped_value(self):
 class MimeParameters(TokenList):
     token_type = 'mime-parameters'
+    syntactic_break = False
     def params(self):
@@ -1165,6 +793,10 @@ def __str__(self):
 class ParameterizedHeaderValue(TokenList):
+    # Set this false so that the value doesn't wind up on a new line even
+    # if it and the parameters would fit there but not on the first line.
+    syntactic_break = False
     def params(self):
         for token in reversed(self):
@@ -1172,18 +804,11 @@ def params(self):
                 return token.params
         return {}
-    @property
-    def parts(self):
-        if self and self[-1].token_type == 'mime-parameters':
-            # We don't want to start a new line if all of the params don't fit
-            # after the value, so unwrap the parameter list.
-            return TokenList(self[:-1] + self[-1])
-        return TokenList(self).parts
 class ContentType(ParameterizedHeaderValue):
     token_type = 'content-type'
+    as_ew_allowed = False
     maintype = 'text'
     subtype = 'plain'
@@ -1191,40 +816,27 @@ class ContentType(ParameterizedHeaderValue):
 class ContentDisposition(ParameterizedHeaderValue):
     token_type = 'content-disposition'
+    as_ew_allowed = False
     content_disposition = None
 class ContentTransferEncoding(TokenList):
     token_type = 'content-transfer-encoding'
+    as_ew_allowed = False
     cte = '7bit'
 class HeaderLabel(TokenList):
     token_type = 'header-label'
+    as_ew_allowed = False
 class Header(TokenList):
     token_type = 'header'
-    def _fold(self, folded):
-        folded.append(str(self.pop(0)))
-        folded.lastlen = len(folded.current[0])
-        # The first line of the header is different from all others: we don't
-        # want to start a new object on a new line if it has any fold points in
-        # it that would allow part of it to be on the first header line.
-        # Further, if the first fold point would fit on the new line, we want
-        # to do that, but if it doesn't we want to put it on the first line.
-        # Folded supports this via the stickyspace attribute.  If this
-        # attribute is not None, it does the special handling.
-        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
-        rest = self.pop(0)
-        if self:
-            raise ValueError("Malformed Header token list")
-        rest._fold(folded)
 # Terminal classes and instances
@@ -1232,6 +844,10 @@ def _fold(self, folded):
 class Terminal(str):
+    as_ew_allowed = True
+    ew_combine_allowed = True
+    syntactic_break = True
     def __new__(cls, value, token_type):
         self = super().__new__(cls, value)
         self.token_type = token_type
@@ -1241,6 +857,9 @@ def __new__(cls, value, token_type):
     def __repr__(self):
         return "{}({})".format(self.__class__.__name__, super().__repr__())
+    def pprint(self):
+        print(self.__class__.__name__ + '/' + self.token_type)
     def all_defects(self):
         return list(self.defects)
@@ -1254,29 +873,14 @@ def _pp(self, indent=''):
             '' if not self.defects else ' {}'.format(self.defects),
-    def cte_encode(self, charset, policy):
-        value = str(self)
-        try:
-            value.encode('us-ascii')
-            return value
-        except UnicodeEncodeError:
-            return _ew.encode(value, charset)
     def pop_trailing_ws(self):
         # This terminates the recursion.
         return None
-    def pop_leading_fws(self):
-        # This terminates the recursion.
-        return None
     def comments(self):
         return []
-    def has_leading_comment(self):
-        return False
     def __getnewargs__(self):
         return(str(self), self.token_type)
@@ -1290,8 +894,6 @@ def value(self):
     def startswith_fws(self):
         return True
-    has_fws = True
 class ValueTerminal(Terminal):
@@ -1302,11 +904,6 @@ def value(self):
     def startswith_fws(self):
         return False
-    has_fws = False
-    def as_encoded_word(self, charset):
-        return _ew.encode(str(self), charset)
 class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
@@ -1314,15 +911,9 @@ class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
     def value(self):
         return ''
-    @property
-    def encoded(self):
-        return self[:]
     def __str__(self):
         return ''
-    has_fws = True
 # XXX these need to become classes and used as instances so
 # that a program can't change them in a parse tree and screw
@@ -2752,7 +2343,7 @@ def get_parameter(value):
         if value[0] != "'":
             raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                           "delimiter, but found {!r}".format(value))
-        appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
         value = value[1:]
         if value and value[0] != "'":
             token, value = get_attrtext(value)
@@ -2761,7 +2352,7 @@ def get_parameter(value):
             if not value or value[0] != "'":
                 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
                                   "delimiter, but found {}".format(value))
-        appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
+        appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
         value = value[1:]
     if remainder is not None:
         # Treat the rest of value as bare quoted string content.
@@ -2966,3 +2557,255 @@ def parse_content_transfer_encoding_header(value):
             token, value = get_phrase(value)
     return cte_header
+# Header folding
+# Header folding is complex, with lots of rules and corner cases.  The
+# following code does its best to obey the rules and handle the corner
+# cases, but you can be sure there are a few bugs :)
+# This folder generally canonicalizes as it goes, preferring the stringified
+# version of each token.  The tokens contain information that supports the
+# folder, including which tokens can be encoded in which ways.
+# Folded text is accumulated in a simple list of strings ('lines'), each
+# one of which should be no longer than policy.max_line_length ('maxlen').
+def _steal_trailing_WSP_if_exists(lines):
+    wsp = ''
+    if lines and lines[-1] and lines[-1][-1] in WSP:
+        wsp = lines[-1][-1]
+        lines[-1] = lines[-1][:-1]
+    return wsp
+def _refold_parse_tree(parse_tree, *, policy):
+    """Return string of contents of parse_tree folded according to RFC rules.
+    """
+    # max_line_length 0/None means no limit, ie: infinitely long.
+    maxlen = policy.max_line_length or float("+inf")
+    encoding = 'utf-8' if policy.utf8 else 'us-ascii'
+    lines = ['']
+    last_ew = None
+    wrap_as_ew_blocked = 0
+    want_encoding = False
+    end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
+    parts = list(parse_tree)
+    while parts:
+        part = parts.pop(0)
+        if part is end_ew_not_allowed:
+            wrap_as_ew_blocked -= 1
+            continue
+        tstr = str(part)
+        try:
+            tstr.encode(encoding)
+            charset = encoding
+        except UnicodeEncodeError:
+            if any(isinstance(x, errors.UndecodableBytesDefect)
+                   for x in part.all_defects):
+                charset = 'unknown-8bit'
+            else:
+                # If policy.utf8 is false this should really be taken from a
+                # 'charset' property on the policy.
+                charset = 'utf-8'
+            want_encoding = True
+        if part.token_type == 'mime-parameters':
+            # Mime parameter folding (using RFC2231) is extra special.
+            _fold_mime_parameters(part, lines, maxlen, encoding)
+            continue
+        if want_encoding and not wrap_as_ew_blocked:
+            if not part.as_ew_allowed:
+                want_encoding = False
+                last_ew = None
+                if part.syntactic_break:
+                    encoded_part = part.fold(policy=policy)[:-1] # strip nl
+                    if policy.linesep not in encoded_part:
+                        # It fits on a single line
+                        if len(encoded_part) > maxlen - len(lines[-1]):
+                            # But not on this one, so start a new one.
+                            newline = _steal_trailing_WSP_if_exists(lines)
+                            # XXX what if encoded_part has no leading FWS?
+                            lines.append(newline)
+                        lines[-1] += encoded_part
+                        continue
+                # Either this is not a major syntactic break, so we don't
+                # want it on a line by itself even if it fits, or it
+                # doesn't fit on a line by itself.  Either way, fall through
+                # to unpacking the subparts and wrapping them.
+            if not hasattr(part, 'encode'):
+                # It's not a Terminal, do each piece individually.
+                parts = list(part) + parts
+            else:
+                # It's a terminal, wrap it as an encoded word, possibly
+                # combining it with previously encoded words if allowed.
+                last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
+                                      part.ew_combine_allowed, charset)
+            want_encoding = False
+            continue
+        if len(tstr) <= maxlen - len(lines[-1]):
+            lines[-1] += tstr
+            continue
+        # This part is too long to fit.  The RFC wants us to break at
+        # "major syntactic breaks", so unless we don't consider this
+        # to be one, check if it will fit on the next line by itself.
+        if (part.syntactic_break and
+                len(tstr) + 1 <= maxlen):
+            newline = _steal_trailing_WSP_if_exists(lines)
+            if newline or part.startswith_fws():
+                lines.append(newline + tstr)
+                continue
+        if not hasattr(part, 'encode'):
+            # It's not a terminal, try folding the subparts.
+            newparts = list(part)
+            if not part.as_ew_allowed:
+                wrap_as_ew_blocked += 1
+                newparts.append(end_ew_not_allowed)
+            parts = newparts + parts
+            continue
+        if part.as_ew_allowed and not wrap_as_ew_blocked:
+            # It doesn't need CTE encoding, but encode it anyway so we can
+            # wrap it.
+            parts.insert(0, part)
+            want_encoding = True
+            continue
+        # We can't figure out how to wrap it, so give up.
+        newline = _steal_trailing_WSP_if_exists(lines)
+        if newline or part.startswith_fws():
+            lines.append(newline + tstr)
+        else:
+            # We can't fold it onto the next line either...
+            lines[-1] += tstr
+    return policy.linesep.join(lines) + policy.linesep
+def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
+    """Fold string to_encode into lines as encoded word, combining if allowed.
+    Return the new value for last_ew, or None if ew_combine_allowed is False.
+    If there is already an encoded word in the last line of lines (indicated by
+    a non-None value for last_ew) and ew_combine_allowed is true, decode the
+    existing ew, combine it with to_encode, and re-encode.  Otherwise, encode
+    to_encode.  In either case, split to_encode as necessary so that the
+    encoded segments fit within maxlen.
+    """
+    if last_ew is not None and ew_combine_allowed:
+        to_encode = str(
+            get_unstructured(lines[-1][last_ew:] + to_encode))
+        lines[-1] = lines[-1][:last_ew]
+    if to_encode[0] in WSP:
+        # We're joining this to non-encoded text, so don't encode
+        # the leading blank.
+        leading_wsp = to_encode[0]
+        to_encode = to_encode[1:]
+        if (len(lines[-1]) == maxlen):
+            lines.append(_steal_trailing_WSP_if_exists(lines))
+        lines[-1] += leading_wsp
+    trailing_wsp = ''
+    if to_encode[-1] in WSP:
+        # Likewise for the trailing space.
+        trailing_wsp = to_encode[-1]
+        to_encode = to_encode[:-1]
+    new_last_ew = len(lines[-1]) if last_ew is None else last_ew
+    while to_encode:
+        remaining_space = maxlen - len(lines[-1])
+        # The RFC2047 chrome takes up 7 characters plus the length
+        # of the charset name.
+        encode_as = 'utf-8' if charset == 'us-ascii' else charset
+        text_space = remaining_space - len(encode_as) - 7
+        if text_space <= 0:
+            lines.append(' ')
+            # XXX We'll get an infinite loop here if maxlen is <= 7
+            continue
+        first_part = to_encode[:text_space]
+        ew = _ew.encode(first_part, charset=encode_as)
+        excess = len(ew) - remaining_space
+        if excess > 0:
+            # encode always chooses the shortest encoding, so this
+            # is guaranteed to fit at this point.
+            first_part = first_part[:-excess]
+            ew = _ew.encode(first_part)
+        lines[-1] += ew
+        to_encode = to_encode[len(first_part):]
+        if to_encode:
+            lines.append(' ')
+            new_last_ew = len(lines[-1])
+    lines[-1] += trailing_wsp
+    return new_last_ew if ew_combine_allowed else None
+def _fold_mime_parameters(part, lines, maxlen, encoding):
+    """Fold TokenList 'part' into the 'lines' list as mime parameters.
+    Using the decoded list of parameters and values, format them according to
+    the RFC rules, including using RFC2231 encoding if the value cannot be
+    expressed in 'encoding' and/or the parameter+value is too long to fit within
+    'maxlen'.
+    """
+    # Special case for RFC2231 encoding: start from decoded values and use
+    # RFC2231 encoding iff needed.
+    #
+    # Note that the 1 and 2s being added to the length calculations are
+    # accounting for the possibly-needed spaces and semicolons we'll be adding.
+    #
+    for name, value in part.params:
+        # XXX What if this ';' puts us over maxlen the first time through the
+        # loop?  We should split the header value onto a newline in that case,
+        # but to do that we need to recognize the need earlier or reparse the
+        # header, so I'm going to ignore that bug for now.  It'll only put us
+        # one character over.
+        if not lines[-1].rstrip().endswith(';'):
+            lines[-1] += ';'
+        charset = encoding
+        error_handler = 'strict'
+        try:
+            value.encode(encoding)
+            encoding_required = False
+        except UnicodeEncodeError:
+            encoding_required = True
+            if utils._has_surrogates(value):
+                charset = 'unknown-8bit'
+                error_handler = 'surrogateescape'
+            else:
+                charset = 'utf-8'
+        if encoding_required:
+            encoded_value = urllib.parse.quote(
+                value, safe='', errors=error_handler)
+            tstr = "{}*={}''{}".format(name, charset, encoded_value)
+        else:
+            tstr = '{}={}'.format(name, quote_string(value))
+        if len(lines[-1]) + len(tstr) + 1 < maxlen:
+            lines[-1] = lines[-1] + ' ' + tstr
+            continue
+        elif len(tstr) + 2 <= maxlen:
+            lines.append(' ' + tstr)
+            continue
+        # We need multiple sections.  We are allowed to mix encoded and
+        # non-encoded sections, but we aren't going to.  We'll encode them all.
+        section = 0
+        extra_chrome = charset + "''"
+        while value:
+            chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
+            if maxlen <= chrome_len + 3:
+                # We need room for the leading blank, the trailing semicolon,
+                # and at least one character of the value.  If we don't
+                # have that, we'd be stuck, so in that case fall back to
+                # the RFC standard width.
+                maxlen = 78
+            splitpoint = maxchars = maxlen - chrome_len - 2
+            while True:
+                partial = value[:splitpoint]
+                encoded_value = urllib.parse.quote(
+                    partial, safe='', errors=error_handler)
+                if len(encoded_value) <= maxchars:
+                    break
+                splitpoint -= 1
+            lines.append(" {}*{}*={}{}".format(
+                name, section, extra_chrome, encoded_value))
+            extra_chrome = ''
+            section += 1
+            value = value[splitpoint:]
+            if value:
+                lines[-1] += ';'
diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py
index 0fc2231e5cb..f5be87f4d24 100644
--- a/Lib/email/headerregistry.py
+++ b/Lib/email/headerregistry.py
@@ -245,13 +245,16 @@ def fold(self, *, policy):
         the header name and the ': ' separator.
-        # At some point we need to only put fws here if it was in the source.
+        # At some point we need to put fws here iff it was in the source.
         header = parser.Header([
                 parser.ValueTerminal(self.name, 'header-name'),
                 parser.ValueTerminal(':', 'header-sep')]),
-            parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]),
-                             self._parse_tree])
+            ])
+        if self._parse_tree:
+            header.append(
+                parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]))
+        header.append(self._parse_tree)
         return header.fold(policy=policy)
diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py
index e0ec87d2080..1667617b9e4 100644
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@@ -14,18 +14,7 @@ def test_EWWhiteSpaceTerminal(self):
         self.assertEqual(x, ' \t')
         self.assertEqual(str(x), '')
         self.assertEqual(x.value, '')
-        self.assertEqual(x.encoded, ' \t')
-    # UnstructuredTokenList
-    def test_undecodable_bytes_error_preserved(self):
-        badstr = b"le pouf c\xaflebre".decode('ascii', 'surrogateescape')
-        unst = parser.get_unstructured(badstr)
-        self.assertDefectsEqual(unst.all_defects, [errors.UndecodableBytesDefect])
-        parts = list(unst.parts)
-        self.assertDefectsEqual(parts[0].all_defects, [])
-        self.assertDefectsEqual(parts[1].all_defects, [])
-        self.assertDefectsEqual(parts[2].all_defects, [errors.UndecodableBytesDefect])
+        self.assertEqual(x.token_type, 'fws')
 class TestParserMixin:
@@ -139,7 +128,6 @@ def test_get_encoded_word_sets_extra_attributes(self):
                          'first second',
-        self.assertEqual(ew.encoded, '=?us-ascii*jive?q?first_second?=')
         self.assertEqual(ew.charset, 'us-ascii')
         self.assertEqual(ew.lang, 'jive')
@@ -150,7 +138,6 @@ def test_get_encoded_word_lang_default_is_blank(self):
                          'first second',
-        self.assertEqual(ew.encoded, '=?us-ascii?q?first_second?=')
         self.assertEqual(ew.charset, 'us-ascii')
         self.assertEqual(ew.lang, '')
@@ -2700,28 +2687,37 @@ def test_address_list_with_unicode_names_in_quotes(self):
     # and with unicode tokens in the comments.  Spaces inside the quotes
     # currently don't do the right thing.
-    def test_initial_whitespace_splitting(self):
+    def test_split_at_whitespace_after_header_before_long_token(self):
         body = parser.get_unstructured('   ' + 'x'*77)
         header = parser.Header([
             parser.HeaderLabel([parser.ValueTerminal('test:', 'atext')]),
             parser.CFWSList([parser.WhiteSpaceTerminal(' ', 'fws')]), body])
         self._test(header, 'test:   \n ' + 'x'*77 + '\n')
-    def test_whitespace_splitting(self):
+    def test_split_at_whitespace_before_long_token(self):
         self._test(parser.get_unstructured('xxx   ' + 'y'*77),
                    'xxx  \n ' + 'y'*77 + '\n')
+    def test_overlong_encodeable_is_wrapped(self):
+        first_token_with_whitespace = 'xxx   '
+        chrome_leader = '=?utf-8?q?'
+        len_chrome = len(chrome_leader) + 2
+        len_non_y = len_chrome + len(first_token_with_whitespace)
+        self._test(parser.get_unstructured(first_token_with_whitespace +
+                                           'y'*80),
+                   first_token_with_whitespace + chrome_leader +
+                       'y'*(78-len_non_y) + '?=\n' +
+                       ' ' + chrome_leader + 'y'*(80-(78-len_non_y)) + '?=\n')
     def test_long_filename_attachment(self):
-        folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"')
-        self.assertEqual(
-            'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"\n',
-            folded
-        )
-        folded = self.policy.fold('Content-Disposition', 'attachment; filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"')
-        self.assertEqual(
-            'Content-Disposition: attachment;\n filename="TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_T.txt"\n',
-            folded
-        )
+        self._test(parser.parse_content_disposition_header(
+            'attachment; filename="TEST_TEST_TEST_TEST'
+                '_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TEST_TES.txt"'),
+            "attachment;\n"
+            " filename*0*=us-ascii''TEST_TEST_TEST_TEST_TEST_TEST"
+                "_TEST_TEST_TEST_TEST_TEST;\n"
+            " filename*1*=_TEST_TES.txt\n",
+            )
 if __name__ == '__main__':
diff --git a/Lib/test/test_email/test_generator.py b/Lib/test/test_email/test_generator.py
index c4f182903af..c1aeaefab77 100644
--- a/Lib/test/test_email/test_generator.py
+++ b/Lib/test/test_email/test_generator.py
@@ -27,7 +27,6 @@ def msgmaker(self, msg, policy=None):
-        # From is wrapped because wrapped it fits in 40.
         40: textwrap.dedent("""\
             To: whom_it_may_concern at example.com
@@ -40,11 +39,11 @@ def msgmaker(self, msg, policy=None):
-        # Neither to nor from fit even if put on a new line,
-        # so we leave them sticking out on the first line.
         20: textwrap.dedent("""\
-            To: whom_it_may_concern at example.com
-            From: nobody_you_want_to_know at example.com
+            To:
+             whom_it_may_concern at example.com
+            From:
+             nobody_you_want_to_know at example.com
             Subject: We the
              willing led by the
              unknowing are doing
@@ -169,6 +168,53 @@ def test_compat32_max_line_length_does_not_fold_when_none(self):
         self.assertEqual(s.getvalue(), self.typ(self.refold_long_expected[0]))
+    def test_rfc2231_wrapping(self):
+        # This is pretty much just to make sure we don't have an infinite
+        # loop; I don't expect anyone to hit this in the field.
+        msg = self.msgmaker(self.typ(textwrap.dedent("""\
+            To: nobody
+            Content-Disposition: attachment;
+             filename="afilenamelongenoghtowraphere"
+            None
+            """)))
+        expected = textwrap.dedent("""\
+            To: nobody
+            Content-Disposition: attachment;
+             filename*0*=us-ascii''afilename;
+             filename*1*=longenoghtowraphere
+            None
+            """)
+        s = self.ioclass()
+        g = self.genclass(s, policy=self.policy.clone(max_line_length=33))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), self.typ(expected))
+    def test_rfc2231_wrapping_switches_to_default_len_if_too_narrow(self):
+        # This is just to make sure we don't have an infinite loop; I don't
+        # expect anyone to hit this in the field, so I'm not bothering to make
+        # the result optimal (the encoding isn't needed).
+        msg = self.msgmaker(self.typ(textwrap.dedent("""\
+            To: nobody
+            Content-Disposition: attachment;
+             filename="afilenamelongenoghtowraphere"
+            None
+            """)))
+        expected = textwrap.dedent("""\
+            To: nobody
+            Content-Disposition:
+             attachment;
+             filename*0*=us-ascii''afilenamelongenoghtowraphere
+            None
+            """)
+        s = self.ioclass()
+        g = self.genclass(s, policy=self.policy.clone(max_line_length=20))
+        g.flatten(msg)
+        self.assertEqual(s.getvalue(), self.typ(expected))
 class TestGenerator(TestGeneratorBase, TestEmailBase):
diff --git a/Lib/test/test_email/test_headerregistry.py b/Lib/test/test_email/test_headerregistry.py
index af836dc9726..30ce0ba54e4 100644
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@@ -229,14 +229,14 @@ def content_type_as_value(self,
         defects =  args[1] if l>1 else []
         decoded =  args[2] if l>2 and args[2] is not DITTO else source
         header = 'Content-Type:' + ' ' if source else ''
-        folded = args[3] if l>3 else header + source + '\n'
+        folded = args[3] if l>3 else header + decoded + '\n'
         h = self.make_header('Content-Type', source)
         self.assertEqual(h.content_type, content_type)
         self.assertEqual(h.maintype, maintype)
         self.assertEqual(h.subtype, subtype)
         self.assertEqual(h.params, parmdict)
         with self.assertRaises(TypeError):
-            h.params['abc'] = 'xyz'   # params is read-only.
+            h.params['abc'] = 'xyz'   # make sure params is read-only.
         self.assertDefectsEqual(h.defects, defects)
         self.assertEqual(h, decoded)
         self.assertEqual(h.fold(policy=policy.default), folded)
@@ -373,9 +373,10 @@ def content_type_as_value(self,
             'text/plain; Charset="utf-8"'),
         # Since this is pretty much the ur-mimeheader, we'll put all the tests
-        # that exercise the parameter parsing and formatting here.
-        #
-        # XXX: question: is minimal quoting preferred?
+        # that exercise the parameter parsing and formatting here.  Note that
+        # when we refold we may canonicalize, so things like whitespace,
+        # quoting, and rfc2231 encoding may change from what was in the input
+        # header.
         'unquoted_param_value': (
             'text/plain; title=foo',
@@ -384,7 +385,8 @@ def content_type_as_value(self,
             {'title': 'foo'},
-            'text/plain; title="foo"'),
+            'text/plain; title="foo"',
+            ),
         'param_value_with_tspecials': (
             'text/plain; title="(bar)foo blue"',
@@ -415,7 +417,8 @@ def content_type_as_value(self,
             {'boundary': 'CPIMSSMTPC06p5f3tG'},
-            'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"'),
+            'Multipart/mixed; boundary="CPIMSSMTPC06p5f3tG"',
+            ),
         'spaces_around_semis': (
             ('image/jpeg; name="wibble.JPG" ; x-mac-type="4A504547" ; '
@@ -429,14 +432,31 @@ def content_type_as_value(self,
             ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
-            # XXX: it could be that we will eventually prefer to fold starting
-            # from the decoded value, in which case these spaces and similar
-            # spaces in other tests will be wrong.
-            ('Content-Type: image/jpeg; name="wibble.JPG" ; '
-                'x-mac-type="4A504547" ;\n'
+            ('Content-Type: image/jpeg; name="wibble.JPG";'
+                ' x-mac-type="4A504547";\n'
              ' x-mac-creator="474B4F4E"\n'),
+        'lots_of_mime_params': (
+            ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+                'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+            'image/jpeg',
+            'image',
+            'jpeg',
+            {'name': 'wibble.JPG',
+             'x-mac-type': '4A504547',
+             'x-mac-creator': '474B4F4E',
+             'x-extrastuff': 'make it longer'},
+            [],
+            ('image/jpeg; name="wibble.JPG"; x-mac-type="4A504547"; '
+                'x-mac-creator="474B4F4E"; x-extrastuff="make it longer"'),
+            # In this case the whole of the MimeParameters does *not* fit
+            # on one line, so we break at a lower syntactic level.
+            ('Content-Type: image/jpeg; name="wibble.JPG";'
+                ' x-mac-type="4A504547";\n'
+             ' x-mac-creator="474B4F4E"; x-extrastuff="make it longer"\n'),
+            ),
         'semis_inside_quotes': (
             'image/jpeg; name="Jim&&Jill"',
@@ -460,19 +480,25 @@ def content_type_as_value(self,
             r'image/jpeg; name="Jim \"Bob\" Jill"'),
-        # XXX: This test works except for the refolding of the header.  I'll
-        # deal with that bug when I deal with the other folding bugs.
-        #'non_ascii_in_params': (
-        #    ('foo\xa7/bar; b\xa7r=two; '
-        #        'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
-        #                                                'surrogateescape')),
-        #    'foo\uFFFD/bar',
-        #    'foo\uFFFD',
-        #    'bar',
-        #    {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
-        #    [errors.UndecodableBytesDefect]*3,
-        #    'foo�/bar; b�r="two"; baz="thr�e"',
-        #    ),
+        'non_ascii_in_params': (
+            ('foo\xa7/bar; b\xa7r=two; '
+                'baz=thr\xa7e'.encode('latin-1').decode('us-ascii',
+                                                        'surrogateescape')),
+            'foo\uFFFD/bar',
+            'foo\uFFFD',
+            'bar',
+            {'b\uFFFDr': 'two', 'baz': 'thr\uFFFDe'},
+            [errors.UndecodableBytesDefect]*3,
+            'foo�/bar; b�r="two"; baz="thr�e"',
+            # XXX Two bugs here: the mime type is not allowed to be an encoded
+            # word, and we shouldn't be emitting surrogates in the parameter
+            # names.  But I don't know what the behavior should be here, so I'm
+            # punting for now.  In practice this is unlikely to be encountered
+            # since headers with binary in them only come from a binary source
+            # and are almost certain to be re-emitted without refolding.
+            'Content-Type: =?unknown-8bit?q?foo=A7?=/bar; b\udca7r="two";\n'
+            " baz*=unknown-8bit''thr%A7e\n",
+            ),
         # RFC 2231 parameter tests.
@@ -494,19 +520,20 @@ def content_type_as_value(self,
             r'image/jpeg; bar="baz\"foobar\"baz"'),
-        # XXX: This test works except for the refolding of the header.  I'll
-        # deal with that bug when I deal with the other folding bugs.
-        #'non_ascii_rfc2231_value': (
-        #    ('text/plain; charset=us-ascii; '
-        #     "title*=us-ascii'en'This%20is%20"
-        #     'not%20f\xa7n').encode('latin-1').decode('us-ascii',
-        #                                             'surrogateescape'),
-        #    'text/plain',
-        #    'text',
-        #    'plain',
-        #    {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
-        #     [errors.UndecodableBytesDefect],
-        #     'text/plain; charset="us-ascii"; title="This is not f�n"'),
+        'non_ascii_rfc2231_value': (
+            ('text/plain; charset=us-ascii; '
+             "title*=us-ascii'en'This%20is%20"
+             'not%20f\xa7n').encode('latin-1').decode('us-ascii',
+                                                     'surrogateescape'),
+            'text/plain',
+            'text',
+            'plain',
+            {'charset': 'us-ascii', 'title': 'This is not f\uFFFDn'},
+             [errors.UndecodableBytesDefect],
+             'text/plain; charset="us-ascii"; title="This is not f�n"',
+            'Content-Type: text/plain; charset="us-ascii";\n'
+            " title*=unknown-8bit''This%20is%20not%20f%A7n\n",
+            ),
         'rfc2231_encoded_charset': (
             'text/plain; charset*=ansi-x3.4-1968\'\'us-ascii',
@@ -529,8 +556,6 @@ def content_type_as_value(self,
             {'name': 'This is ***fun*** is it not.pdf'},
             'text/plain; name="This is ***fun*** is it not.pdf"',
-            ('Content-Type: text/plain;\tname*0*=\'\'This%20is%20;\n'
-             '\tname*1*=%2A%2A%2Afun%2A%2A%2A%20;\tname*2="is it not.pdf"\n'),
         # Make sure we also handle it if there are spurious double quotes.
@@ -545,9 +570,6 @@ def content_type_as_value(self,
             {'name': 'This is even more ***fun*** is it not.pdf'},
             'text/plain; name="This is even more ***fun*** is it not.pdf"',
-            ('Content-Type: text/plain;\t'
-                'name*0*="us-ascii\'\'This%20is%20even%20more%20";\n'
-             '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it not.pdf"\n'),
         'rfc2231_single_quote_inside_double_quotes': (
@@ -562,9 +584,8 @@ def content_type_as_value(self,
             ('text/plain; charset="us-ascii"; '
                'title="This is really ***fun*** isn\'t it!"'),
-            ('Content-Type: text/plain; charset=us-ascii;\n'
-             '\ttitle*0*="us-ascii\'en\'This%20is%20really%20";\n'
-             '\ttitle*1*="%2A%2A%2Afun%2A%2A%2A%20";\ttitle*2="isn\'t it!"\n'),
+            ('Content-Type: text/plain; charset="us-ascii";\n'
+                ' title="This is really ***fun*** isn\'t it!"\n'),
         'rfc2231_single_quote_in_value_with_charset_and_lang': (
@@ -576,9 +597,6 @@ def content_type_as_value(self,
             {'name': "Frank's Document"},
             'application/x-foo; name="Frank\'s Document"',
-            ('Content-Type: application/x-foo;\t'
-                'name*0*="us-ascii\'en-us\'Frank\'s";\n'
-             ' name*1*=" Document"\n'),
         'rfc2231_single_quote_in_non_encoded_value': (
@@ -590,9 +608,6 @@ def content_type_as_value(self,
             {'name': "us-ascii'en-us'Frank's Document"},
             'application/x-foo; name="us-ascii\'en-us\'Frank\'s Document"',
-            ('Content-Type: application/x-foo;\t'
-                'name*0="us-ascii\'en-us\'Frank\'s";\n'
-             ' name*1=" Document"\n'),
         'rfc2231_no_language_or_charset': (
@@ -615,12 +630,8 @@ def content_type_as_value(self,
             {'name': 'This is even more ***fun*** is it.pdf'},
             'text/plain; name="This is even more ***fun*** is it.pdf"',
-            ('Content-Type: text/plain;\t'
-                'name*0*="\'\'This%20is%20even%20more%20";\n'
-             '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
-        # XXX: see below...the first name line here should be *0 not *0*.
         'rfc2231_partly_encoded': (
@@ -632,9 +643,6 @@ def content_type_as_value(self,
             {'name': 'This is even more ***fun*** is it.pdf'},
             'text/plain; name="This is even more ***fun*** is it.pdf"',
-            ('Content-Type: text/plain;\t'
-                'name*0*="\'\'This%20is%20even%20more%20";\n'
-             '\tname*1*="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
         'rfc2231_partly_encoded_2': (
@@ -647,10 +655,11 @@ def content_type_as_value(self,
             {'name': 'This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf'},
-            'text/plain; name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"',
-            ('Content-Type: text/plain;\t'
-                'name*0*="\'\'This%20is%20even%20more%20";\n'
-             '\tname*1="%2A%2A%2Afun%2A%2A%2A%20";\tname*2="is it.pdf"\n'),
+            ('text/plain;'
+             ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is it.pdf"'),
+            ('Content-Type: text/plain;\n'
+             ' name="This is even more %2A%2A%2Afun%2A%2A%2A%20is'
+                ' it.pdf"\n'),
         'rfc2231_unknown_charset_treated_as_ascii': (
@@ -669,9 +678,12 @@ def content_type_as_value(self,
             {'charset': 'utf-8\uFFFD\uFFFD\uFFFD'},
-            'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"'),
+            'text/plain; charset="utf-8\uFFFD\uFFFD\uFFFD"',
+            "Content-Type: text/plain;"
+            " charset*=unknown-8bit''utf-8%F1%F2%F3\n",
+            ),
-        'rfc2231_utf_8_in_supposedly_ascii_charset_parameter_value': (
+        'rfc2231_utf8_in_supposedly_ascii_charset_parameter_value': (
             "text/plain; charset*=ascii''utf-8%E2%80%9D",
@@ -679,9 +691,11 @@ def content_type_as_value(self,
             {'charset': 'utf-8”'},
             'text/plain; charset="utf-8”"',
+            # XXX Should folding change the charset to utf8?  Currently it just
+            # reproduces the original, which is arguably fine.
+            "Content-Type: text/plain;"
+            " charset*=unknown-8bit''utf-8%E2%80%9D\n",
-            # XXX: if the above were *re*folded, it would get tagged as utf-8
-            # instead of ascii in the param, since it now contains non-ASCII.
         'rfc2231_encoded_then_unencoded_segments': (
@@ -694,9 +708,6 @@ def content_type_as_value(self,
             {'name': 'My Document For You'},
             'application/x-foo; name="My Document For You"',
-            ('Content-Type: application/x-foo;\t'
-                'name*0*="us-ascii\'en-us\'My";\n'
-             '\tname*1=" Document";\tname*2=" For You"\n'),
         # My reading of the RFC is that this is an invalid header.  The RFC
@@ -713,11 +724,6 @@ def content_type_as_value(self,
             {'name': 'My Document For You'},
             'application/x-foo; name="My Document For You"',
-            ("Content-Type: application/x-foo;\tname*0=us-ascii'en-us'My;\t"
-                # XXX: the newline is in the wrong place, come back and fix
-                # this when the rest of tests pass.
-                'name*1*=" Document"\n;'
-             '\tname*2*=" For You"\n'),
         # XXX: I would say this one should default to ascii/en for the
@@ -730,8 +736,7 @@ def content_type_as_value(self,
         # charset'lang'value pattern exactly *and* there is at least one
         # encoded segment.  Implementing that algorithm will require some
         # refactoring, so I haven't done it (yet).
-        'rfc2231_qouted_unencoded_then_encoded_segments': (
+        'rfc2231_quoted_unencoded_then_encoded_segments': (
                 '\tname*1*=" Document";'
@@ -742,9 +747,25 @@ def content_type_as_value(self,
             {'name': "us-ascii'en-us'My Document For You"},
             'application/x-foo; name="us-ascii\'en-us\'My Document For You"',
-            ('Content-Type: application/x-foo;\t'
-                'name*0="us-ascii\'en-us\'My";\n'
-             '\tname*1*=" Document";\tname*2*=" For You"\n'),
+            ),
+        # Make sure our folding algorithm produces multiple sections correctly.
+        # We could mix encoded and non-encoded segments, but we don't, we just
+        # make them all encoded.  It might be worth fixing that, since the
+        # sections can get used for wrapping ascii text.
+        'rfc2231_folded_segments_correctly_formatted': (
+            ('application/x-foo;'
+                '\tname="' + "with spaces"*8 + '"'),
+            'application/x-foo',
+            'application',
+            'x-foo',
+            {'name': "with spaces"*8},
+            [],
+            'application/x-foo; name="' + "with spaces"*8 + '"',
+            "Content-Type: application/x-foo;\n"
+            " name*0*=us-ascii''with%20spaceswith%20spaceswith%20spaceswith"
+                "%20spaceswith;\n"
+            " name*1*=%20spaceswith%20spaceswith%20spaceswith%20spaces\n"
@@ -827,8 +848,8 @@ def content_disp_as_value(self,
             ('attachment; filename="genome.jpeg"; '
                  'modification-date="Wed, 12 Feb 1997 16:29:51 -0500"'),
-            ('Content-Disposition: attachment; filename=genome.jpeg;\n'
-             '  modification-date="Wed, 12 Feb 1997 16:29:51 -0500";\n'),
+            ('Content-Disposition: attachment; filename="genome.jpeg";\n'
+             ' modification-date="Wed, 12 Feb 1997 16:29:51 -0500"\n'),
         'no_value': (
@@ -873,7 +894,7 @@ def version_string_as_MIME_Version(self,
         if source:
             source = ' ' + source
-                        'MIME-Version:' + source + '\n')
+                         'MIME-Version:' + source + '\n')
     version_string_params = {
@@ -1546,15 +1567,39 @@ def test_fold_unstructured_with_overlong_word(self):
-            'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n')
+            'Subject: \n'
+            ' =?utf-8?q?thisisa?=\n'
+            ' =?utf-8?q?verylon?=\n'
+            ' =?utf-8?q?glineco?=\n'
+            ' =?utf-8?q?nsistin?=\n'
+            ' =?utf-8?q?gofasin?=\n'
+            ' =?utf-8?q?gleword?=\n'
+            ' =?utf-8?q?thatwon?=\n'
+            ' =?utf-8?q?tfit?=\n'
+            )
     def test_fold_unstructured_with_two_overlong_words(self):
         h = self.make_header('Subject', 'thisisaverylonglineconsistingofa'
             'singlewordthatwontfit plusanotherverylongwordthatwontfit')
-            'Subject: thisisaverylonglineconsistingofasinglewordthatwontfit\n'
-                ' plusanotherverylongwordthatwontfit\n')
+            'Subject: \n'
+            ' =?utf-8?q?thisisa?=\n'
+            ' =?utf-8?q?verylon?=\n'
+            ' =?utf-8?q?glineco?=\n'
+            ' =?utf-8?q?nsistin?=\n'
+            ' =?utf-8?q?gofasin?=\n'
+            ' =?utf-8?q?gleword?=\n'
+            ' =?utf-8?q?thatwon?=\n'
+            ' =?utf-8?q?tfit_pl?=\n'
+            ' =?utf-8?q?usanoth?=\n'
+            ' =?utf-8?q?erveryl?=\n'
+            ' =?utf-8?q?ongword?=\n'
+            ' =?utf-8?q?thatwon?=\n'
+            ' =?utf-8?q?tfit?=\n'
+            )
+    # XXX Need test for when max_line_length is less than the chrome size.
     def test_fold_unstructured_with_slightly_long_word(self):
         h = self.make_header('Subject', 'thislongwordislessthanmaxlinelen')
@@ -1590,6 +1635,18 @@ def test_fold_date_header(self):
                         'Date: Sat, 02 Feb 2002 17:00:06 -0800\n')
+    def test_fold_overlong_words_using_RFC2047(self):
+        h = self.make_header(
+            'X-Report-Abuse',
+            '<https://www.mailitapp.com/report_abuse.php?'
+              'mid=xxx-xxx-xxxxxxxxxxxxxxxxxxxxxxxx==-xxx-xx-xx>')
+        self.assertEqual(
+            h.fold(policy=policy.default),
+            'X-Report-Abuse: =?utf-8?q?=3Chttps=3A//www=2Emailitapp=2E'
+                'com/report=5F?=\n'
+            ' =?utf-8?q?abuse=2Ephp=3Fmid=3Dxxx-xxx-xxxx'
+                'xxxxxxxxxxxxxxxxxxxx=3D=3D-xxx-?=\n'
+            ' =?utf-8?q?xx-xx=3E?=\n')
 if __name__ == '__main__':
diff --git a/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
new file mode 100644
index 00000000000..c933ee7d916
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2017-12-02-16-06-00.bpo-27240.Kji34M.rst
@@ -0,0 +1,3 @@
+The header folding algorithm for the new email policies has been rewritten,
+which also fixes bpo-30788, bpo-31831, and bpo-32182.  In particular, RFC2231
+folding is now done correctly.

More information about the Python-checkins mailing list