[Python-checkins] python/dist/src/Lib/email Parser.py,1.20,1.20.4.1

Wed, 11 Jun 2003 23:08:58 -0700

Update of /cvsroot/python/python/dist/src/Lib/email
In directory sc8-pr-cvs1:/tmp/cvs-serv17621

Modified Files:
      Tag: anthony-parser-branch
	Parser.py 
Log Message:
A work-in-progress snapshot of the new parser. A couple of known problems:

- first (blank) line of MIME epilogues is being consumed
- message/delivery-status isn't quite right

It still needs a lot of cleanup, but right now it parses a whole lot of
badness that the old parser failed on. I also need to think about adding 
back the old 'strict' flag in some way.

Index: Parser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Parser.py,v
retrieving revision 1.20
retrieving revision 1.20.4.1
diff -C2 -d -r1.20 -r1.20.4.1
*** Parser.py	6 Mar 2003 05:25:35 -0000	1.20
--- Parser.py	12 Jun 2003 06:08:56 -0000	1.20.4.1
***************
*** 23,26 ****
--- 23,96 ----
  NLCRE = re.compile('\r\n|\r|\n')

+ class TextUtil:
+     """Wrap a file object and add line push-back and scanning helpers.
+ 
+     Pushed-back lines are kept on a stack: the line most recently handed
+     to unreadline() is the next one readline() returns.
+     """
+ 
+     def __init__(self, fp):
+         self.fp = fp
+         # Stack of pushed-back lines; the last item is the next line to read.
+         self.unread = []
+ 
+     def readline(self):
+         """Return a line of data.
+ 
+         If data has been pushed back with unreadline(), the most recently
+         pushed line is returned; otherwise a line is read from the wrapped
+         file object.
+         """
+         if self.unread:
+             return self.unread.pop()
+         return self.fp.readline()
+ 
+     def unreadline(self, line):
+         """Push a line back so that the next readline() returns it."""
+         self.unread.append(line)
+ 
+     def peekline(self):
+         """Non-destructively look at the next line."""
+         line = self.readline()
+         self.unreadline(line)
+         return line
+ 
+     def read(self):
+         """Return the remaining data, pushed-back lines first.
+ 
+         Pushed-back lines are emitted in the order readline() would have
+         returned them, followed by the rest of the wrapped file (exactly
+         once).  A pushed-back line that carries no line ending of its own
+         gets a newline appended so it does not run into the text after it;
+         lines that still have their line ending are left untouched.
+         """
+         pending = self.unread
+         self.unread = []
+         # The top of the stack is the next line to read, so flip the stack
+         # to get reading order.
+         pending.reverse()
+         chunks = []
+         for line in pending:
+             if line and line[-1] not in "\r\n":
+                 line = line + "\n"
+             chunks.append(line)
+         chunks.append(self.fp.read())
+         return "".join(chunks)
+ 
+     def readuntil(self, re, afterblank=0, includematch=0):
+         """Read a line at a time until a line matches the specified RE.
+ 
+         Returns the text up to (and including, if includematch is true) the
+         matched line, and the RE match object. If afterblank is true, the
+         line immediately before the matched line must be blank (nothing
+         but a line ending); blank lines are kept in the returned text.
+         Moves the current position to the line following the matched line.
+         If we reach end-of-file, return what we've got so far, and return
+         None as the RE match object.
+         """
+         # NOTE: the parameter name shadows the re module inside this method.
+         # Only its match(line) method is used, e.g. on a compiled pattern.
+         prematch = []
+         blankseen = 0
+         while 1:
+             line = self.readline()
+             if not line:
+                 # end of file
+                 return EMPTYSTRING.join(prematch), None
+             if afterblank and NLCRE.match(line):
+                 # A blank line is never the match itself; keep it in the
+                 # returned text and remember it for the line that follows.
+                 blankseen = 1
+                 prematch.append(line)
+                 continue
+             m = re.match(line)
+             if m and (blankseen or not afterblank):
+                 if includematch:
+                     prematch.append(line)
+                 return EMPTYSTRING.join(prematch), m
+             # Only clear the flag after the match test, so that a match
+             # directly after a blank line is actually accepted.
+             blankseen = 0
+             prematch.append(line)

***************
*** 60,66 ****
          """
          root = self._class()
!         firstbodyline = self._parseheaders(root, fp)
          if not headersonly:
!             self._parsebody(root, fp, firstbodyline)
          return root

--- 130,139 ----
          """
          root = self._class()
!         fp = TextUtil(fp)
!         self._parseheaders(root, fp)
          if not headersonly:
!             obj = self._parsemessage(root, fp)
!             trailer = fp.read()
!             self._attach_trailer(obj, trailer)
          return root

***************
*** 81,85 ****
          lastvalue = []
          lineno = 0
-         firstbodyline = None
          while True:
              # Don't strip the line before we test for the end condition,
--- 154,157 ----
***************
*** 130,134 ****
                      # 2822, but we're in non-strict mode.  So just offer up
                      # this current line as the first body line.
!                     firstbodyline = line
                      break
              if lastheader:
--- 202,206 ----
                      # 2822, but we're in non-strict mode.  So just offer up
                      # this current line as the first body line.
!                     fp.unreadline(line)
                      break
              if lastheader:
***************
*** 139,276 ****
          if lastheader:
              container[lastheader] = NL.join(lastvalue)
!         return firstbodyline

!     def _parsebody(self, container, fp, firstbodyline=None):
!         # Parse the body, but first split the payload on the content-type
!         # boundary if present.
          boundary = container.get_boundary()
          isdigest = (container.get_content_type() == 'multipart/digest')
!         # If there's a boundary, split the payload text into its constituent
!         # parts and parse each separately.  Otherwise, just parse the rest of
!         # the body as a single message.  Note: any exceptions raised in the
!         # recursive parse need to have their line numbers coerced.
!         if boundary:
              preamble = epilogue = None
-             # Split into subparts.  The first boundary we're looking for won't
-             # always have a leading newline since we're at the start of the
-             # body text, and there's not always a preamble before the first
-             # boundary.
              separator = '--' + boundary
!             payload = fp.read()
!             if firstbodyline is not None:
!                 payload = firstbodyline + '\n' + payload
!             # We use an RE here because boundaries can have trailing
!             # whitespace.
!             mo = re.search(
!                 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
!                 payload)
!             if not mo:
!                 if self._strict:
!                     raise Errors.BoundaryError(
!                         "Couldn't find starting boundary: %s" % boundary)
!                 container.set_payload(payload)
!                 return
!             start = mo.start()
!             if start > 0:
!                 # there's some pre-MIME boundary preamble
!                 preamble = payload[0:start]
!             # Find out what kind of line endings we're using
!             start += len(mo.group('sep')) + len(mo.group('ws'))
!             mo = NLCRE.search(payload, start)
!             if mo:
!                 start += len(mo.group(0))
!             # We create a compiled regexp first because we need to be able to
!             # specify the start position, and the module function doesn't
!             # support this signature. :(
!             cre = re.compile('(?P<sep>\r\n|\r|\n)' +
!                              re.escape(separator) + '--')
!             mo = cre.search(payload, start)
!             if mo:
!                 terminator = mo.start()
!                 linesep = mo.group('sep')
!                 if mo.end() < len(payload):
!                     # There's some post-MIME boundary epilogue
!                     epilogue = payload[mo.end():]
!             elif self._strict:
!                 raise Errors.BoundaryError(
!                         "Couldn't find terminating boundary: %s" % boundary)
!             else:
!                 # Handle the case of no trailing boundary.  Check that it ends
!                 # in a blank line.  Some cases (spamspamspam) don't even have
!                 # that!
!                 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
!                 if not mo:
!                     mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
!                     if not mo:
!                         raise Errors.BoundaryError(
!                           'No terminating boundary and no trailing empty line')
!                 linesep = mo.group('sep')
!                 terminator = len(payload)
!             # We split the textual payload on the boundary separator, which
!             # includes the trailing newline. If the container is a
!             # multipart/digest then the subparts are by default message/rfc822
!             # instead of text/plain.  In that case, they'll have a optional
!             # block of MIME headers, then an empty line followed by the
!             # message headers.
!             parts = re.split(
!                 linesep + re.escape(separator) + r'[ \t]*' + linesep,
!                 payload[start:terminator])
!             for part in parts:
                  if isdigest:
!                     if part.startswith(linesep):
!                         # There's no header block so create an empty message
!                         # object as the container, and lop off the newline so
!                         # we can parse the sub-subobject
!                         msgobj = self._class()
!                         part = part[len(linesep):]
!                     else:
!                         parthdrs, part = part.split(linesep+linesep, 1)
!                         # msgobj in this case is the "message/rfc822" container
!                         msgobj = self.parsestr(parthdrs, headersonly=1)
!                     # while submsgobj is the message itself
!                     msgobj.set_default_type('message/rfc822')
!                     maintype = msgobj.get_content_maintype()
!                     if maintype in ('message', 'multipart'):
!                         submsgobj = self.parsestr(part)
!                         msgobj.attach(submsgobj)
                      else:
!                         msgobj.set_payload(part)
                  else:
!                     msgobj = self.parsestr(part)
!                 container.preamble = preamble
!                 container.epilogue = epilogue
!                 container.attach(msgobj)
!         elif container.get_main_type() == 'multipart':
              # Very bad.  A message is a multipart with no boundary!
              raise Errors.BoundaryError(
!                 'multipart message with no defined boundary')
!         elif container.get_type() == 'message/delivery-status':
!             # This special kind of type contains blocks of headers separated
!             # by a blank line.  We'll represent each header block as a
!             # separate Message object
!             blocks = []
!             while True:
!                 blockmsg = self._class()
!                 self._parseheaders(blockmsg, fp)
!                 if not len(blockmsg):
!                     # No more header blocks left
!                     break
!                 blocks.append(blockmsg)
!             container.set_payload(blocks)
!         elif container.get_main_type() == 'message':
!             # Create a container for the payload, but watch out for there not
!             # being any headers left
!             try:
!                 msg = self.parse(fp)
!             except Errors.HeaderParseError:
!                 msg = self._class()
!                 self._parsebody(msg, fp)
!             container.attach(msg)
          else:
!             text = fp.read()
!             if firstbodyline is not None:
!                 text = firstbodyline + '\n' + text
!             container.set_payload(text)

--- 211,312 ----
          if lastheader:
              container[lastheader] = NL.join(lastvalue)
!         return 

!     def _parsemessage(self, container, fp):
!         """Parse the body that follows container's headers.
! 
!         Walks the body from top to bottom, recursing into nested
!         multipart/* and message/* parts.  fp must provide readline(),
!         peekline() and readuntil(), all of which are called here.
! 
!         Returns the object that should receive whatever text follows this
!         block (see _attach_trailer()).  Raises Errors.BoundaryError for a
!         multipart container without a boundary, or when a part's text ends
!         without any line ending; raises ValueError for message/* subtypes
!         other than rfc822 and delivery-status.
!         """
          boundary = container.get_boundary()
          isdigest = (container.get_content_type() == 'multipart/digest')
!         if boundary:
              preamble = epilogue = None
              separator = '--' + boundary
!             boundaryRE = re.compile(
!                     r'(?P<sep>' + re.escape(separator) +
!                     r')(?P<end>--)?(?P<ws>[ \t]*)')
!             preamble, matchobj = fp.readuntil(boundaryRE)
!             if not matchobj:
!                 # Broken. Just set the body to the text
!                 container.set_payload(preamble)
!                 return container
!             container.preamble = preamble
!             while 1:
!                 subobj = self._class()
                  if isdigest:
!                     subobj.set_default_type('message/rfc822')
!                     firstline = fp.peekline()
!                     if firstline.strip():
!                         # we have MIME headers. all good.
!                         self._parseheaders(subobj, fp)
                      else:
!                         # no MIME headers. this is allowed for multipart/digest
!                         # Consume the extra blank line
!                         fp.readline()
                  else:
!                     self._parseheaders(subobj, fp)
!                 container.attach(subobj)
!                 if subobj.get_content_maintype() in ("message", "multipart"):
!                     # From here on, trailing text goes to whichever object
!                     # the nested parse hands back.
!                     subobj = self._parsemessage(subobj, fp)
! 
!                 trailer, matchobj = fp.readuntil(boundaryRE)
!                 if matchobj is None or trailer:
!                     # Drop the final line ending from the text that
!                     # precedes the boundary (or the end of input).
!                     mo = re.search('(?P<sep>\r\n|\r|\n){2}$', trailer)
!                     if not mo:
!                         mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
!                         if not mo:
!                             raise Errors.BoundaryError(
!                           'No terminating boundary and no trailing empty line')
!                     linesep = mo.group('sep')
!                     trailer = trailer[:-len(linesep)]
!                 if trailer:
!                     self._attach_trailer(subobj, trailer)
!                 if matchobj is None or matchobj.group('end'):
!                     # Either the input ran out (matchobj is None and must
!                     # not be dereferenced) or that was the closing boundary.
!                     # Let our caller attach the epilogue to us.
!                     return container
! 
!         elif container.get_content_maintype() == "multipart":
              # Very bad.  A message is a multipart with no boundary!
              raise Errors.BoundaryError(
!                     'multipart message with no defined boundary')
!         elif container.get_content_maintype() == "message":
!             ct = container.get_content_type()
!             if ct == "message/rfc822":
!                 # The body is itself a message: parse its headers, then
!                 # its body, and hand it back to receive the trailing text.
!                 submessage = self._class()
!                 self._parseheaders(submessage, fp)
!                 self._parsemessage(submessage, fp)
!                 container.attach(submessage)
!                 return submessage
!             elif ct == "message/delivery-status":
!                 # This special kind of type contains blocks of headers
!                 # separated by a blank line.  We'll represent each header
!                 # block as a separate Message object
!                 while 1:
!                     nextblock = self._class()
!                     self._parseheaders(nextblock, fp)
!                     if not len(nextblock):
!                         # No more header blocks left; stopping here also
!                         # keeps the loop from spinning without progress.
!                         break
!                     container.attach(nextblock)
!                     # next peek ahead to see whether we've hit the end or not
!                     nextline = fp.peekline()
!                     if not nextline or nextline[:2] == "--":
!                         break
!                 # Return only after every block has been collected, and on
!                 # every path, so the caller never receives None.
!                 return container
!             else:
!                 raise ValueError("%s not implemented yet" % (ct,))
          else:
!             # single body section. We let our caller set the payload.
!             return container

+     def _attach_trailer(self, obj, trailer):
+         """Store trailer on obj.
+ 
+         Objects whose main content type is "message" or "multipart" get the
+         text as their epilogue attribute; all others get it as payload.
+         """
+         kind = obj.get_content_maintype()
+         if kind == "message" or kind == "multipart":
+             obj.epilogue = trailer
+             return
+         obj.set_payload(trailer)

***************
*** 285,292 ****
      interested in is the message headers.
      """
!     def _parsebody(self, container, fp, firstbodyline=None):
          # Consume but do not parse, the body
          text = fp.read()
-         if firstbodyline is not None:
-             text = firstbodyline + '\n' + text
          container.set_payload(text)
--- 321,326 ----
      interested in is the message headers.
      """
!     def _parsemessage(self, container, fp):
!         """Read the rest of fp and store it, unparsed, as container's payload.
! 
!         NOTE(review): this override returns None, while parse() in this
!         patch passes the return value of _parsemessage() on to
!         _attach_trailer(), which calls methods on it -- confirm that this
!         combination is handled.
!         """
          # Consume but do not parse, the body
          text = fp.read()
          container.set_payload(text)