[Python-checkins] python/dist/src/Lib/email Charset.py,1.7.2.2,1.7.2.3 Generator.py,1.6.10.2,1.6.10.3 Header.py,1.13.2.1,1.13.2.2 __init__.py,1.4.10.3,1.4.10.4

bwarsaw@users.sourceforge.net
Mon, 14 Oct 2002 10:26:06 -0700


Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv8082/Lib/email

Modified Files:
      Tag: release22-maint
	Charset.py Generator.py Header.py __init__.py 
Log Message:
Backport bugfix microrelease of email 2.4.3 from cvs trunk.
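
For context, a minimal sketch of what the backported 8-bit handling is meant
to allow (hypothetical usage, not part of this checkin; it assumes the email
2.4.3 API shown in the diffs below):

    import email

    # A message whose Subject carries raw, unencoded 8-bit bytes longer than
    # the default header wrap length.
    msg = email.message_from_string('Subject: ' + '\xe9' * 100 + '\n\nbody\n')
    # Flattening should no longer try to split the 8-bit header (a split
    # could land in the middle of a multibyte sequence); it goes out as-is,
    # even though the line may exceed maxheaderlen.
    flattened = msg.as_string()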


Index: Charset.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Charset.py,v
retrieving revision 1.7.2.2
retrieving revision 1.7.2.3
diff -C2 -d -r1.7.2.2 -r1.7.2.3
*** Charset.py	10 Oct 2002 19:09:23 -0000	1.7.2.2
--- Charset.py	14 Oct 2002 17:26:00 -0000	1.7.2.3
***************
*** 44,47 ****
--- 44,49 ----
      'koi8-r':      (BASE64,    BASE64,  None),
      'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
+     # We're making this one up to represent raw unencoded 8-bit
+     '8bit':        (None,      BASE64, 'utf-8'),
      }
  
***************
*** 54,72 ****
      }
  
! # Map charsets to their Unicode codec strings.  Note that the Japanese
! # examples included below do not (yet) come with Python!  They are available
! # from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
! 
! # The Chinese and Korean codecs are available from SourceForge:
! #
! #     http://sourceforge.net/projects/python-codecs/
! #
! # although you'll need to check them out of cvs since they haven't been file
! # released yet.  You might also try to use
  #
! #     http://www.freshports.org/port-description.php3?port=6702
  #
! # if you can get logged in.  AFAICT, both the Chinese and Korean codecs are
! # fairly experimental at this point.
  CODEC_MAP = {
      'euc-jp':      'japanese.euc-jp',
--- 56,69 ----
      }
  
! # Map charsets to their Unicode codec strings.  Note that Python doesn't come
! # with any Asian codecs by default.  Here's where to get them:
  #
! # Japanese -- http://www.asahi-net.or.jp/~rd6t-kjym/python
! # Korean   -- http://sf.net/projects/koco
! # Chinese  -- http://sf.net/projects/python-codecs
  #
! # Note that these codecs have their own lifecycle and may be in varying states
! # of stability and usability.
! 
  CODEC_MAP = {
      'euc-jp':      'japanese.euc-jp',
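
For illustration (not part of the diff above), the effect of the new faux
'8bit' entry on a Charset instance, using only names that appear in this
module and in the Header.py diff below:

    from email.Charset import Charset, BASE64

    c = Charset('8bit')
    # No header encoding, so header text passes through unencoded; body
    # encoding falls back to base64 with utf-8 as the declared charset.
    assert c.header_encoding is None
    assert c.body_encoding == BASE64
    assert c.get_output_charset() == 'utf-8'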

Index: Generator.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Generator.py,v
retrieving revision 1.6.10.2
retrieving revision 1.6.10.3
diff -C2 -d -r1.6.10.2 -r1.6.10.3
*** Generator.py	4 Oct 2002 17:24:23 -0000	1.6.10.2
--- Generator.py	14 Oct 2002 17:26:01 -0000	1.6.10.3
***************
*** 9,13 ****
  import random
  
! from types import ListType
  from cStringIO import StringIO
  
--- 9,13 ----
  import random
  
! from types import ListType, StringType
  from cStringIO import StringIO
  
***************
*** 36,39 ****
--- 36,47 ----
  fcre = re.compile(r'^From ', re.MULTILINE)
  
+ def _is8bitstring(s):
+     if isinstance(s, StringType):
+         try:
+             unicode(s, 'us-ascii')
+         except UnicodeError:
+             return True
+     return False
+ 
  
  
***************
*** 174,177 ****
--- 182,193 ----
              # No line was actually longer than maxheaderlen characters, so
              # just return the original unchanged.
+             return text
+         # If we have raw 8bit data in a byte string, we have no idea what the
+         # encoding is.  I think there is no safe way to split this string.  If
+         # it's ascii-subset, then we could do a normal ascii split, but if
+         # it's multibyte then we could break the string.  There's no way to
+         # know so the least harm seems to be to not split the string and risk
+         # it being too long.
+         if _is8bitstring(text):
              return text
          # The `text' argument already has the field name prepended, so don't
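
For illustration (not part of the diff above), what the new _is8bitstring()
helper reports; hypothetical usage of a private helper, shown only to make
the check concrete:

    from email.Generator import _is8bitstring

    assert not _is8bitstring('hello')       # pure us-ascii byte string
    assert _is8bitstring('caf\xe9')         # contains a raw 8-bit byte
    assert not _is8bitstring(u'caf\xe9')    # unicode objects are left alone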

Index: Header.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Header.py,v
retrieving revision 1.13.2.1
retrieving revision 1.13.2.2
diff -C2 -d -r1.13.2.1 -r1.13.2.2
*** Header.py	4 Oct 2002 17:24:24 -0000	1.13.2.1
--- Header.py	14 Oct 2002 17:26:02 -0000	1.13.2.2
***************
*** 154,157 ****
--- 154,159 ----
          if charset is None:
              charset = USASCII
+         if not isinstance(charset, Charset):
+             charset = Charset(charset)
          self._charset = charset
          self._continuation_ws = continuation_ws
***************
*** 217,239 ****
          elif not isinstance(charset, Charset):
              charset = Charset(charset)
!         # Normalize and check the string
!         if isinstance(s, StringType):
!             # Possibly raise UnicodeError if it can't e encoded
!             unicode(s, charset.get_output_charset())
!         elif isinstance(s, UnicodeType):
!             # Convert Unicode to byte string for later concatenation
!             for charset in USASCII, charset, UTF8:
!                 try:
!                     s = s.encode(charset.get_output_charset())
!                     break
!                 except UnicodeError:
!                     pass
!             else:
!                 assert False, 'Could not encode to utf-8'
          self._chunks.append((s, charset))
  
      def _split(self, s, charset, firstline=False):
!         # Split up a header safely for use with encode_chunks.  BAW: this
!         # appears to be a private convenience method.
          splittable = charset.to_splittable(s)
          encoded = charset.from_splittable(splittable)
--- 219,254 ----
          elif not isinstance(charset, Charset):
              charset = Charset(charset)
!         # If the charset is our faux 8bit charset, leave the string unchanged
!         if charset <> '8bit':
!             # We need to test that the string can be converted to unicode and
!             # back to a byte string, given the input and output codecs of the
!             # charset.
!             if isinstance(s, StringType):
!                 # Possibly raise UnicodeError if the byte string can't be
!                 # converted to a unicode with the input codec of the charset.
!                 incodec = charset.input_codec or 'us-ascii'
!                 ustr = unicode(s, incodec)
!                 # Now make sure that the unicode could be converted back to a
!                 # byte string with the output codec, which may be different
!                 # than the input codec.  Still, use the original byte string.
!                 outcodec = charset.output_codec or 'us-ascii'
!                 ustr.encode(outcodec)
!             elif isinstance(s, UnicodeType):
!                 # Now we have to be sure the unicode string can be converted
!                 # to a byte string with a reasonable output codec.  We want to
!                 # use the byte string in the chunk.
!                 for charset in USASCII, charset, UTF8:
!                     try:
!                         outcodec = charset.output_codec or 'us-ascii'
!                         s = s.encode(outcodec)
!                         break
!                     except UnicodeError:
!                         pass
!                 else:
!                     assert False, 'utf-8 conversion failed'
          self._chunks.append((s, charset))
  
      def _split(self, s, charset, firstline=False):
!         # Split up a header safely for use with encode_chunks.
          splittable = charset.to_splittable(s)
          encoded = charset.from_splittable(splittable)
***************
*** 242,245 ****
--- 257,268 ----
          if elen <= self._maxlinelen:
              return [(encoded, charset)]
+         # If we have undetermined raw 8bit characters sitting in a byte
+         # string, we really don't know what the right thing to do is.  We
+         # can't really split it because it might be multibyte data which we
+         # could break if we split it between pairs.  The least harm seems to
+         # be to not split the header at all, but that means they could go out
+         # longer than maxlinelen.
+         elif charset == '8bit':
+             return [(s, charset)]
          # BAW: I'm not sure what the right test here is.  What we're trying to
          # do is be faithful to RFC 2822's recommendation that ($2.2.3):
***************
*** 347,371 ****
          return [(chunk, charset) for chunk in rtn]
  
!     def _encode_chunks(self):
!         """MIME-encode a header with many different charsets and/or encodings.
! 
!         Given a list of pairs (string, charset), return a MIME-encoded string
!         suitable for use in a header field.  Each pair may have different
!         charsets and/or encodings, and the resulting header will accurately
!         reflect each setting.
! 
!         Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
!         character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
!         non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
!         (no encoding).
! 
!         Each pair will be represented on a separate line; the resulting string
!         will be in the format:
! 
!         "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
!           =?charset2?b?SvxyZ2VuIEL2aW5n?="
!         """
          chunks = []
!         for header, charset in self._chunks:
              if charset is None or charset.header_encoding is None:
                  # There's no encoding for this chunk's charsets
--- 370,394 ----
          return [(chunk, charset) for chunk in rtn]
  
!     def _encode_chunks(self, newchunks):
!         # MIME-encode a header with many different charsets and/or encodings.
!         #
!         # Given a list of pairs (string, charset), return a MIME-encoded
!         # string suitable for use in a header field.  Each pair may have
!         # different charsets and/or encodings, and the resulting header will
!         # accurately reflect each setting.
!         #
!         # Each encoding can be email.Utils.QP (quoted-printable, for
!         # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
!         # (Base64, for non-ASCII like character sets like KOI8-R and
!         # iso-2022-jp), or None (no encoding).
!         #
!         # Each pair will be represented on a separate line; the resulting
!         # string will be in the format:
!         #
!         # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
!         #  =?charset2?b?SvxyZ2VuIEL2aW5n?=
!         #
          chunks = []
!         for header, charset in newchunks:
              if charset is None or charset.header_encoding is None:
                  # There's no encoding for this chunk's charsets
***************
*** 398,401 ****
          for s, charset in self._chunks:
              newchunks += self._split(s, charset, True)
!         self._chunks = newchunks
!         return self._encode_chunks()
--- 421,423 ----
          for s, charset in self._chunks:
              newchunks += self._split(s, charset, True)
!         return self._encode_chunks(newchunks)
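
For illustration (not part of the diff above), how a Header behaves with the
faux '8bit' charset after these changes; a rough sketch assuming the email
2.4.3 API shown in this diff:

    from email.Header import Header

    h = Header(maxlinelen=20)
    # Raw 8-bit bytes of unknown encoding: append() skips the unicode
    # round-trip check because the charset compares equal to '8bit', and
    # _split() returns the chunk whole rather than risk breaking a
    # multibyte sequence, so the encoded header may exceed maxlinelen.
    h.append('x' * 10 + '\xe9' * 30, '8bit')
    encoded = h.encode()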

Index: __init__.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/__init__.py,v
retrieving revision 1.4.10.3
retrieving revision 1.4.10.4
diff -C2 -d -r1.4.10.3 -r1.4.10.4
*** __init__.py	10 Oct 2002 19:09:24 -0000	1.4.10.3
--- __init__.py	14 Oct 2002 17:26:02 -0000	1.4.10.4
***************
*** 5,9 ****
  """
  
! __version__ = '2.4.2'
  
  __all__ = [
--- 5,9 ----
  """
  
! __version__ = '2.4.3'
  
  __all__ = [