[Python-checkins] python/dist/src/Lib/email Parser.py,1.9,1.10

Mon, 08 Jul 2002 19:50:04 -0700

Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv14753/email

Modified Files:
	Parser.py 
Log Message:
Anthony Baxter's patch for non-strict parsing.  This adds a `strict'
argument to the constructor -- defaulting to true -- which differs
from Anthony's approach of using global state.

parse(), parsestr(): Grow a `headersonly' argument which stops parsing
once the header block has been seen, i.e. it does /not/ parse or even
read the body of the message.  This is used for parsing message/rfc822
type messages.

We need test cases for the non-strict parsing.  Anthony will supply
these.

_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.

Index: Parser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Parser.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** Parser.py	2 Jun 2002 19:12:03 -0000	1.9
--- Parser.py	9 Jul 2002 02:50:02 -0000	1.10
***************
*** 15,22 ****
  NL = '\n'

- 

  class Parser:
!     def __init__(self, _class=Message.Message):
          """Parser of RFC 2822 and MIME email messages.

--- 15,21 ----
  NL = '\n'

  class Parser:
!     def __init__(self, _class=Message.Message, strict=1):
          """Parser of RFC 2822 and MIME email messages.

***************
*** 33,47 ****
          must be created.  This class must have a constructor that can take
          zero arguments.  Default is Message.Message.
          """
          self._class = _class

!     def parse(self, fp):
          root = self._class()
          self._parseheaders(root, fp)
!         self._parsebody(root, fp)
          return root

!     def parsestr(self, text):
!         return self.parse(StringIO(text))

      def _parseheaders(self, container, fp):
--- 32,54 ----
          must be created.  This class must have a constructor that can take
          zero arguments.  Default is Message.Message.
+ 
+         Optional strict tells the parser to be strictly RFC compliant or to be
+         more forgiving in parsing of ill-formatted MIME documents.  When
+         non-strict mode is used, the parser will try to make up for missing or
+         erroneous boundaries and other peculiarities seen in the wild.
+         Defaults to strict parsing.
          """
          self._class = _class
+         self._strict = strict

!     def parse(self, fp, headersonly=0):
          root = self._class()
          self._parseheaders(root, fp)
!         if not headersonly:
!             self._parsebody(root, fp)
          return root

!     def parsestr(self, text, headersonly=0):
!         return self.parse(StringIO(text), headersonly=headersonly)

      def _parseheaders(self, container, fp):
***************
*** 68,74 ****
                      container.set_unixfrom(line)
                      continue
!                 else:
                      raise Errors.HeaderParseError(
                          'Unix-from in headers after first rfc822 header')
              # Header continuation line
              if line[0] in ' \t':
--- 75,85 ----
                      container.set_unixfrom(line)
                      continue
!                 elif self._strict:
                      raise Errors.HeaderParseError(
                          'Unix-from in headers after first rfc822 header')
+                 else:
+                     # ignore the wierdly placed From_ line
+                     # XXX: maybe set unixfrom anyway? or only if not already?
+                     continue
              # Header continuation line
              if line[0] in ' \t':
***************
*** 85,90 ****
              i = line.find(':')
              if i < 0:
!                 raise Errors.HeaderParseError(
!                     'Not a header, not a continuation')
              if lastheader:
                  container[lastheader] = NL.join(lastvalue)
--- 96,108 ----
              i = line.find(':')
              if i < 0:
!                 if self._strict:
!                     raise Errors.HeaderParseError(
!                         "Not a header, not a continuation: ``%s''"%line)
!                 elif lineno == 1 and line.startswith('--'):
!                     # allow through duplicate boundary tags.
!                     continue
!                 else:
!                     raise Errors.HeaderParseError(
!                         "Not a header, not a continuation: ``%s''"%line)
              if lastheader:
                  container[lastheader] = NL.join(lastvalue)
***************
*** 123,127 ****
              mo = cre.search(payload, start)
              if mo:
!                 start += len(mo.group(0)) * (1 + isdigest)
              # We create a compiled regexp first because we need to be able to
              # specify the start position, and the module function doesn't
--- 141,145 ----
              mo = cre.search(payload, start)
              if mo:
!                 start += len(mo.group(0))
              # We create a compiled regexp first because we need to be able to
              # specify the start position, and the module function doesn't
***************
*** 130,151 ****
                               re.escape(separator) + '--')
              mo = cre.search(payload, start)
!             if not mo:
                  raise Errors.BoundaryError(
!                     "Couldn't find terminating boundary: %s" % boundary)
!             terminator = mo.start()
!             linesep = mo.group('sep')
!             if mo.end() < len(payload):
!                 # there's some post-MIME boundary epilogue
!                 epilogue = payload[mo.end():]
              # We split the textual payload on the boundary separator, which
!             # includes the trailing newline.  If the container is a
!             # multipart/digest then the subparts are by default message/rfc822
!             # instead of text/plain.  In that case, they'll have an extra
!             # newline before the headers to distinguish the message's headers
!             # from the subpart headers.
!             separator += linesep * (1 + isdigest)
              parts = payload[start:terminator].split(linesep + separator)
              for part in parts:
!                 msgobj = self.parsestr(part)
                  container.preamble = preamble
                  container.epilogue = epilogue
--- 148,198 ----
                               re.escape(separator) + '--')
              mo = cre.search(payload, start)
!             if mo:
!                 terminator = mo.start()
!                 linesep = mo.group('sep')
!                 if mo.end() < len(payload):
!                     # there's some post-MIME boundary epilogue
!                     epilogue = payload[mo.end():]
!             elif self._strict:
                  raise Errors.BoundaryError(
!                         "Couldn't find terminating boundary: %s" % boundary)
!             else:
!                 # handle the case of no trailing boundary. I hate mail clients.
!                 # check that it ends in a blank line
!                 endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
!                 mo = endre.search(payload)
!                 if not mo:
!                     raise Errors.BoundaryError(
!                         "Couldn't find terminating boundary, and no "+
!                         "trailing empty line")
!                 else:
!                     linesep = mo.group('sep')
!                     terminator = len(payload)
              # We split the textual payload on the boundary separator, which
!             # includes the trailing newline. If the container is a
!             # multipart/digest then the subparts are by default message/rfc822 
!             # instead of text/plain.  In that case, they'll have a optional 
!             # block of MIME headers, then an empty line followed by the 
!             # message headers.
!             separator += linesep
              parts = payload[start:terminator].split(linesep + separator)
              for part in parts:
!                 if isdigest: 
!                     if part[0] == linesep:
!                         # There's no header block so create an empty message
!                         # object as the container, and lop off the newline so
!                         # we can parse the sub-subobject
!                         msgobj = self._class()
!                         part = part[1:]
!                     else:
!                         parthdrs, part = part.split(linesep+linesep, 1)
!                         # msgobj in this case is the "message/rfc822" container
!                         msgobj = self.parsestr(parthdrs, headersonly=1)
!                     # while submsgobj is the message itself
!                     submsgobj = self.parsestr(part)
!                     msgobj.attach(submsgobj)
!                     msgobj.set_default_type('message/rfc822')
!                 else:
!                     msgobj = self.parsestr(part)
                  container.preamble = preamble
                  container.epilogue = epilogue