[Python-checkins] python/dist/src/Lib/email Charset.py,1.7.2.2,1.7.2.3 Generator.py,1.6.10.2,1.6.10.3 Header.py,1.13.2.1,1.13.2.2 __init__.py,1.4.10.3,1.4.10.4
bwarsaw@users.sourceforge.net
bwarsaw@users.sourceforge.net
Mon, 14 Oct 2002 10:26:06 -0700
Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv8082/Lib/email
Modified Files:
Tag: release22-maint
Charset.py Generator.py Header.py __init__.py
Log Message:
Backport bugfix microrelease of email 2.4.3 from cvs trunk.
Index: Charset.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Charset.py,v
retrieving revision 1.7.2.2
retrieving revision 1.7.2.3
diff -C2 -d -r1.7.2.2 -r1.7.2.3
*** Charset.py 10 Oct 2002 19:09:23 -0000 1.7.2.2
--- Charset.py 14 Oct 2002 17:26:00 -0000 1.7.2.3
***************
*** 44,47 ****
--- 44,49 ----
'koi8-r': (BASE64, BASE64, None),
'utf-8': (SHORTEST, BASE64, 'utf-8'),
+ # We're making this one up to represent raw unencoded 8-bit
+ '8bit': (None, BASE64, 'utf-8'),
}
***************
*** 54,72 ****
}
! # Map charsets to their Unicode codec strings. Note that the Japanese
! # examples included below do not (yet) come with Python! They are available
! # from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
!
! # The Chinese and Korean codecs are available from SourceForge:
! #
! # http://sourceforge.net/projects/python-codecs/
! #
! # although you'll need to check them out of cvs since they haven't been file
! # released yet. You might also try to use
#
! # http://www.freshports.org/port-description.php3?port=6702
#
! # if you can get logged in. AFAICT, both the Chinese and Korean codecs are
! # fairly experimental at this point.
CODEC_MAP = {
'euc-jp': 'japanese.euc-jp',
--- 56,69 ----
}
! # Map charsets to their Unicode codec strings. Note that Python doesn't come
! # with any Asian codecs by default. Here's where to get them:
#
! # Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
! # Korean -- http://sf.net/projects/koco
! # Chinese -- http://sf.net/projects/python-codecs
#
! # Note that these codecs have their own lifecycle and may be in varying states
! # of stability and usability.
!
CODEC_MAP = {
'euc-jp': 'japanese.euc-jp',
Index: Generator.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Generator.py,v
retrieving revision 1.6.10.2
retrieving revision 1.6.10.3
diff -C2 -d -r1.6.10.2 -r1.6.10.3
*** Generator.py 4 Oct 2002 17:24:23 -0000 1.6.10.2
--- Generator.py 14 Oct 2002 17:26:01 -0000 1.6.10.3
***************
*** 9,13 ****
import random
! from types import ListType
from cStringIO import StringIO
--- 9,13 ----
import random
! from types import ListType, StringType
from cStringIO import StringIO
***************
*** 36,39 ****
--- 36,47 ----
fcre = re.compile(r'^From ', re.MULTILINE)
+ def _is8bitstring(s):
+ if isinstance(s, StringType):
+ try:
+ unicode(s, 'us-ascii')
+ except UnicodeError:
+ return True
+ return False
+
***************
*** 174,177 ****
--- 182,193 ----
# No line was actually longer than maxheaderlen characters, so
# just return the original unchanged.
+ return text
+ # If we have raw 8bit data in a byte string, we have no idea what the
+ # encoding is. I think there is no safe way to split this string. If
+ # it's ascii-subset, then we could do a normal ascii split, but if
+ # it's multibyte then we could break the string. There's no way to
+ # know so the least harm seems to be to not split the string and risk
+ # it being too long.
+ if _is8bitstring(text):
return text
# The `text' argument already has the field name prepended, so don't
Index: Header.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Header.py,v
retrieving revision 1.13.2.1
retrieving revision 1.13.2.2
diff -C2 -d -r1.13.2.1 -r1.13.2.2
*** Header.py 4 Oct 2002 17:24:24 -0000 1.13.2.1
--- Header.py 14 Oct 2002 17:26:02 -0000 1.13.2.2
***************
*** 154,157 ****
--- 154,159 ----
if charset is None:
charset = USASCII
+ if not isinstance(charset, Charset):
+ charset = Charset(charset)
self._charset = charset
self._continuation_ws = continuation_ws
***************
*** 217,239 ****
elif not isinstance(charset, Charset):
charset = Charset(charset)
! # Normalize and check the string
! if isinstance(s, StringType):
! # Possibly raise UnicodeError if it can't be encoded
! unicode(s, charset.get_output_charset())
! elif isinstance(s, UnicodeType):
! # Convert Unicode to byte string for later concatenation
! for charset in USASCII, charset, UTF8:
! try:
! s = s.encode(charset.get_output_charset())
! break
! except UnicodeError:
! pass
! else:
! assert False, 'Could not encode to utf-8'
self._chunks.append((s, charset))
def _split(self, s, charset, firstline=False):
! # Split up a header safely for use with encode_chunks. BAW: this
! # appears to be a private convenience method.
splittable = charset.to_splittable(s)
encoded = charset.from_splittable(splittable)
--- 219,254 ----
elif not isinstance(charset, Charset):
charset = Charset(charset)
! # If the charset is our faux 8bit charset, leave the string unchanged
! if charset <> '8bit':
! # We need to test that the string can be converted to unicode and
! # back to a byte string, given the input and output codecs of the
! # charset.
! if isinstance(s, StringType):
! # Possibly raise UnicodeError if the byte string can't be
! # converted to a unicode with the input codec of the charset.
! incodec = charset.input_codec or 'us-ascii'
! ustr = unicode(s, incodec)
! # Now make sure that the unicode could be converted back to a
! # byte string with the output codec, which may be different
! # than the input codec. Still, use the original byte string.
! outcodec = charset.output_codec or 'us-ascii'
! ustr.encode(outcodec)
! elif isinstance(s, UnicodeType):
! # Now we have to be sure the unicode string can be converted
! # to a byte string with a reasonable output codec. We want to
! # use the byte string in the chunk.
! for charset in USASCII, charset, UTF8:
! try:
! outcodec = charset.output_codec or 'us-ascii'
! s = s.encode(outcodec)
! break
! except UnicodeError:
! pass
! else:
! assert False, 'utf-8 conversion failed'
self._chunks.append((s, charset))
def _split(self, s, charset, firstline=False):
! # Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
encoded = charset.from_splittable(splittable)
***************
*** 242,245 ****
--- 257,268 ----
if elen <= self._maxlinelen:
return [(encoded, charset)]
+ # If we have undetermined raw 8bit characters sitting in a byte
+ # string, we really don't know what the right thing to do is. We
+ # can't really split it because it might be multibyte data which we
+ # could break if we split it between pairs. The least harm seems to
+ # be to not split the header at all, but that means they could go out
+ # longer than maxlinelen.
+ elif charset == '8bit':
+ return [(s, charset)]
# BAW: I'm not sure what the right test here is. What we're trying to
# do is be faithful to RFC 2822's recommendation that ($2.2.3):
***************
*** 347,371 ****
return [(chunk, charset) for chunk in rtn]
! def _encode_chunks(self):
! """MIME-encode a header with many different charsets and/or encodings.
!
! Given a list of pairs (string, charset), return a MIME-encoded string
! suitable for use in a header field. Each pair may have different
! charsets and/or encodings, and the resulting header will accurately
! reflect each setting.
!
! Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
! character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
! non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
! (no encoding).
!
! Each pair will be represented on a separate line; the resulting string
! will be in the format:
!
! "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
! =?charset2?b?SvxyZ2VuIEL2aW5n?="
! """
chunks = []
! for header, charset in self._chunks:
if charset is None or charset.header_encoding is None:
# There's no encoding for this chunk's charsets
--- 370,394 ----
return [(chunk, charset) for chunk in rtn]
! def _encode_chunks(self, newchunks):
! # MIME-encode a header with many different charsets and/or encodings.
! #
! # Given a list of pairs (string, charset), return a MIME-encoded
! # string suitable for use in a header field. Each pair may have
! # different charsets and/or encodings, and the resulting header will
! # accurately reflect each setting.
! #
! # Each encoding can be email.Utils.QP (quoted-printable, for
! # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
! # (Base64, for non-ASCII like character sets like KOI8-R and
! # iso-2022-jp), or None (no encoding).
! #
! # Each pair will be represented on a separate line; the resulting
! # string will be in the format:
! #
! # "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
! # =?charset2?b?SvxyZ2VuIEL2aW5n?="
! #
chunks = []
! for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
# There's no encoding for this chunk's charsets
***************
*** 398,401 ****
for s, charset in self._chunks:
newchunks += self._split(s, charset, True)
! self._chunks = newchunks
! return self._encode_chunks()
--- 421,423 ----
for s, charset in self._chunks:
newchunks += self._split(s, charset, True)
! return self._encode_chunks(newchunks)
Index: __init__.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/__init__.py,v
retrieving revision 1.4.10.3
retrieving revision 1.4.10.4
diff -C2 -d -r1.4.10.3 -r1.4.10.4
*** __init__.py 10 Oct 2002 19:09:24 -0000 1.4.10.3
--- __init__.py 14 Oct 2002 17:26:02 -0000 1.4.10.4
***************
*** 5,9 ****
"""
! __version__ = '2.4.2'
__all__ = [
--- 5,9 ----
"""
! __version__ = '2.4.3'
__all__ = [