[Python-checkins] python/dist/src/Lib/email Parser.py,1.21,1.22

bwarsaw at users.sourceforge.net bwarsaw at users.sourceforge.net
Sat May 8 23:46:44 EDT 2004


Update of /cvsroot/python/python/dist/src/Lib/email
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv13873

Modified Files:
	Parser.py 
Log Message:
Update to Python 2.3, getting rid of backward compatibility crud.

This Parser is now just a backward compatible front-end to the FeedParser.


Index: Parser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Parser.py,v
retrieving revision 1.21
retrieving revision 1.22
diff -C2 -d -r1.21 -r1.22
*** Parser.py	20 Mar 2004 17:31:29 -0000	1.21
--- Parser.py	9 May 2004 03:46:42 -0000	1.22
***************
*** 1,99 ****
! # Copyright (C) 2001,2002 Python Software Foundation
! # Author: barry at zope.com (Barry Warsaw)
  
! """A parser of RFC 2822 and MIME email messages.
! """
  
  import re
  from cStringIO import StringIO
! from types import ListType
! 
! from email import Errors
! from email import Message
! 
! EMPTYSTRING = ''
! NL = '\n'
! 
! try:
!     True, False
! except NameError:
!     True = 1
!     False = 0
  
  NLCRE = re.compile('\r\n|\r|\n')
  
- class TextUtil:
-     """ A utility class for wrapping a file object and providing a 
-         couple of additional useful functions.
-     """
- 
-     def __init__(self, fp):
-         self.fp = fp
-         self.unread = []
- 
-     def readline(self):
-         """ Return a line of data.
- 
-         If data has been pushed back with unreadline(), the most recently
-         returned unreadline()d data will be returned.
-         """
-         if self.unread:
-             return self.unread.pop()
-         else:
-             return self.fp.readline()
- 
-     def unreadline(self, line):
-         """Push a line back into the object. 
-         """
-         self.unread.append(line)
- 
-     def peekline(self):
-         """Non-destructively look at the next line"""
-         line = self.readline()
-         self.unreadline(line)
-         return line
- 
-     def read(self):
-         """Return the remaining data
-         """
-         r = self.fp.read()
-         if self.unread:
-             r = "\n".join(self.unread) + r
-             self.unread = []
-         return r
- 
-     def readuntil(self, re, afterblank=0, includematch=0):
-         """Read a line at a time until we get the specified RE. 
- 
-         Returns the text up to (and including, if includematch is true) the 
-         matched text, and the RE match object. If afterblank is true, 
-         there must be a blank line before the matched text. Moves current 
-         filepointer to the line following the matched line. If we reach 
-         end-of-file, return what we've got so far, and return None as the
-         RE match object.
-         """
-         prematch = []
-         blankseen = 0
-         while 1:
-             line = self.readline()
-             if not line:
-                 # end of file
-                 return EMPTYSTRING.join(prematch), None
-             if afterblank:
-                 if NLCRE.match(line):
-                     blankseen = 1
-                     continue
-                 else:
-                     blankseen = 0
-             m = re.match(line)
-             if (m and not afterblank) or (m and afterblank and blankseen):
-                 if includematch:
-                     prematch.append(line)
-                 return EMPTYSTRING.join(prematch), m
-             prematch.append(line)
  
  
  class Parser:
!     def __init__(self, _class=Message.Message, strict=False):
          """Parser of RFC 2822 and MIME email messages.
  
--- 1,19 ----
! # Copyright (C) 2001-2004 Python Software Foundation
! # Author: Barry Warsaw, Thomas Wouters, Anthony Baxter
! # Contact: email-sig at python.org
  
! """A parser of RFC 2822 and MIME email messages."""
  
  import re
  from cStringIO import StringIO
! from email.FeedParser import FeedParser
! from email.Message import Message
  
  NLCRE = re.compile('\r\n|\r|\n')
  
  
  
  class Parser:
!     def __init__(self, _class=Message, strict=False):
          """Parser of RFC 2822 and MIME email messages.
  
***************
*** 118,122 ****
          """
          self._class = _class
-         self._strict = strict
  
      def parse(self, fp, headersonly=False):
--- 38,41 ----
***************
*** 128,140 ****
          meaning it parses the entire contents of the file.
          """
!         root = self._class()
!         fp = TextUtil(fp)
!         self._parseheaders(root, fp)
!         if not headersonly:
!             obj = self._parsemessage(root, fp)
!             trailer = fp.read()
!             if obj and trailer:
!                 self._attach_trailer(obj, trailer)
!         return root
  
      def parsestr(self, text, headersonly=False):
--- 47,59 ----
          meaning it parses the entire contents of the file.
          """
!         feedparser = FeedParser(self._class)
!         if headersonly:
!             feedparser._set_headersonly()
!         while True:
!             data = fp.read(8192)
!             if not data:
!                 break
!             feedparser.feed(data)
!         return feedparser.close()
  
      def parsestr(self, text, headersonly=False):
***************
*** 148,337 ****
          return self.parse(StringIO(text), headersonly=headersonly)
  
-     def _parseheaders(self, container, fp):
-         # Parse the headers, returning a list of header/value pairs.  None as
-         # the header means the Unix-From header.
-         lastheader = ''
-         lastvalue = []
-         lineno = 0
-         while True:
-             # Don't strip the line before we test for the end condition,
-             # because whitespace-only header lines are RFC compliant
-             # continuation lines.
-             line = fp.readline()
-             if not line:
-                 break
-             line = line.splitlines()[0]
-             if not line:
-                 break
-             # Ignore the trailing newline
-             lineno += 1
-             # Check for initial Unix From_ line
-             if line.startswith('From '):
-                 if lineno == 1:
-                     container.set_unixfrom(line)
-                     continue
-                 elif self._strict:
-                     raise Errors.HeaderParseError(
-                         'Unix-from in headers after first rfc822 header')
-                 else:
-                     # ignore the wierdly placed From_ line
-                     # XXX: maybe set unixfrom anyway? or only if not already?
-                     continue
-             # Header continuation line
-             if line[0] in ' \t':
-                 if not lastheader:
-                     raise Errors.HeaderParseError(
-                         'Continuation line seen before first header')
-                 lastvalue.append(line)
-                 continue
-             # Normal, non-continuation header.  BAW: this should check to make
-             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
-             # should expose the header matching algorithm in the API, and
-             # allow for a non-strict parsing mode (that ignores the line
-             # instead of raising the exception).
-             i = line.find(':')
-             if i < 0:
-                 if self._strict:
-                     raise Errors.HeaderParseError(
-                         "Not a header, not a continuation: ``%s''" % line)
-                 elif lineno == 1 and line.startswith('--'):
-                     # allow through duplicate boundary tags.
-                     continue
-                 else:
-                     # There was no separating blank line as mandated by RFC
-                     # 2822, but we're in non-strict mode.  So just offer up
-                     # this current line as the first body line.
-                     fp.unreadline(line)
-                     break
-             if lastheader:
-                 container[lastheader] = NL.join(lastvalue)
-             lastheader = line[:i]
-             lastvalue = [line[i+1:].lstrip()]
-         # Make sure we retain the last header
-         if lastheader:
-             container[lastheader] = NL.join(lastvalue)
-         return 
- 
-     def _parsemessage(self, container, fp):
-         # Parse the body. We walk through the body from top to bottom,
-         # keeping track of the current multipart nesting as we go.
-         # We return the object that gets the data at the end of this 
-         # block.
-         boundary = container.get_boundary()
-         isdigest = (container.get_content_type() == 'multipart/digest')
-         if boundary: 
-             separator = '--' + boundary
-             boundaryRE = re.compile(
-                     r'(?P<sep>' + re.escape(separator) + 
-                     r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
-             preamble, matchobj = fp.readuntil(boundaryRE)
-             if not matchobj:
-                 # Broken - we hit the end of file. Just set the body 
-                 # to the text.
-                 container.set_payload(preamble)
-                 return container
-             if preamble:
-                 container.preamble = preamble
-             else:
-                 # The module docs specify an empty preamble is None, not ''
-                 container.preamble = None
-             while 1:
-                 subobj = self._class()
-                 if isdigest:
-                     subobj.set_default_type('message/rfc822')
-                     firstline = fp.peekline()
-                     if firstline.strip():
-                         # we have MIME headers. all good. 
-                         self._parseheaders(subobj, fp)
-                     else:
-                         # no MIME headers. this is allowed for multipart/digest
-                         # Consume the extra blank line
-                         fp.readline()
-                         pass
-                 else:
-                     self._parseheaders(subobj, fp)
-                 container.attach(subobj)
-                 maintype = subobj.get_content_maintype()
-                 hassubparts = (subobj.get_content_maintype() in 
-                                                 ( "message", "multipart" ))
-                 if hassubparts:
-                     subobj = self._parsemessage(subobj, fp)
- 
-                 trailer, matchobj = fp.readuntil(boundaryRE)
-                 if matchobj is None or trailer:
-                     mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
-                     if not mo:
-                         mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
-                         if not mo:
-                             raise Errors.BoundaryError(
-                           'No terminating boundary and no trailing empty line')
-                     linesep = mo.group('sep')
-                     trailer = trailer[:-len(linesep)]
-                 if trailer:
-                     self._attach_trailer(subobj, trailer)
-                 if matchobj is None or matchobj.group('end'):
-                     # That was the last piece of data. Let our caller attach
-                     # the epilogue to us. But before we do that, push the
-                     # line ending of the match group back into the readline
-                     # buffer, as it's part of the epilogue.
-                     if matchobj:
-                         fp.unreadline(matchobj.group('linesep'))
-                     return container
- 
-         elif container.get_content_maintype() == "multipart":
-             # Very bad.  A message is a multipart with no boundary!
-             raise Errors.BoundaryError(
-                     'multipart message with no defined boundary')
-         elif container.get_content_maintype() == "message":
-             ct = container.get_content_type()
-             if ct == "message/rfc822":
-                 submessage = self._class()
-                 self._parseheaders(submessage, fp)
-                 self._parsemessage(submessage, fp)
-                 container.attach(submessage)
-                 return submessage
-             elif ct == "message/delivery-status":
-                 # This special kind of type contains blocks of headers 
-                 # separated by a blank line.  We'll represent each header 
-                 # block as a separate Message object
-                 while 1:
-                     nextblock = self._class()
-                     self._parseheaders(nextblock, fp)
-                     container.attach(nextblock)
-                     # next peek ahead to see whether we've hit the end or not
-                     nextline = fp.peekline()
-                     if nextline[:2] == "--":
-                         break
-                 return container
-             else:
-                 # Other sort of message object (e.g. external-body)
-                 msg = self._class()
-                 self._parsemessage(msg, fp)
-                 container.attach(msg)
-                 return msg
-         else:
-             # single body section. We let our caller set the payload.
-             return container
- 
-     def _attach_trailer(self, obj, trailer):
-         if obj.get_content_maintype() in ("message", "multipart"):
-             obj.epilogue = trailer
-         else:
-             obj.set_payload(trailer)
  
  
  class HeaderParser(Parser):
!     """A subclass of Parser, this one only meaningfully parses message headers.
! 
!     This class can be used if all you're interested in is the headers of a
!     message.  While it consumes the message body, it does not parse it, but
!     simply makes it available as a string payload.
  
!     Parsing with this subclass can be considerably faster if all you're
!     interested in is the message headers.
!     """
!     def _parsemessage(self, container, fp):
!         # Consume but do not parse, the body
!         text = fp.read()
!         container.set_payload(text)
!         return None
--- 67,76 ----
          return self.parse(StringIO(text), headersonly=headersonly)
  
  
  
  class HeaderParser(Parser):
!     def parse(self, fp, headersonly=True):
!         return Parser.parse(self, fp, True)
  
!     def parsestr(self, text, headersonly=True):
!         return Parser.parsestr(self, text, True)




More information about the Python-checkins mailing list