[Python-checkins] python/dist/src/Lib/email Parser.py,1.9,1.10
bwarsaw@users.sourceforge.net
bwarsaw@users.sourceforge.net
Mon, 08 Jul 2002 19:50:04 -0700
Update of /cvsroot/python/python/dist/src/Lib/email
In directory usw-pr-cvs1:/tmp/cvs-serv14753/email
Modified Files:
Parser.py
Log Message:
Anthony Baxter's patch for non-strict parsing. This adds a `strict'
argument to the constructor -- defaulting to true -- which differs
from Anthony's original approach of using global state.
parse(), parsestr(): Grow a `headersonly' argument which, when true,
stops parsing once the header block has been seen, i.e. it does /not/
parse or even read the body of the message. This is used for parsing
messages of type message/rfc822.
We need test cases for the non-strict parsing. Anthony will supply
these.
_parsebody(): We can get rid of the isdigest end-of-line kludges,
although we still need to know if we're parsing a multipart/digest so
we can set the default type accordingly.
Index: Parser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Parser.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** Parser.py 2 Jun 2002 19:12:03 -0000 1.9
--- Parser.py 9 Jul 2002 02:50:02 -0000 1.10
***************
*** 15,22 ****
NL = '\n'
-
class Parser:
! def __init__(self, _class=Message.Message):
"""Parser of RFC 2822 and MIME email messages.
--- 15,21 ----
NL = '\n'
class Parser:
! def __init__(self, _class=Message.Message, strict=1):
"""Parser of RFC 2822 and MIME email messages.
***************
*** 33,47 ****
must be created. This class must have a constructor that can take
zero arguments. Default is Message.Message.
"""
self._class = _class
! def parse(self, fp):
root = self._class()
self._parseheaders(root, fp)
! self._parsebody(root, fp)
return root
! def parsestr(self, text):
! return self.parse(StringIO(text))
def _parseheaders(self, container, fp):
--- 32,54 ----
must be created. This class must have a constructor that can take
zero arguments. Default is Message.Message.
+
+ Optional strict tells the parser to be strictly RFC compliant or to be
+ more forgiving in parsing of ill-formatted MIME documents. When
+ non-strict mode is used, the parser will try to make up for missing or
+ erroneous boundaries and other peculiarities seen in the wild.
+ Defaults to strict parsing.
"""
self._class = _class
+ self._strict = strict
! def parse(self, fp, headersonly=0):
root = self._class()
self._parseheaders(root, fp)
! if not headersonly:
! self._parsebody(root, fp)
return root
! def parsestr(self, text, headersonly=0):
! return self.parse(StringIO(text), headersonly=headersonly)
def _parseheaders(self, container, fp):
***************
*** 68,74 ****
container.set_unixfrom(line)
continue
! else:
raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header')
# Header continuation line
if line[0] in ' \t':
--- 75,85 ----
container.set_unixfrom(line)
continue
! elif self._strict:
raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header')
+ else:
+ # ignore the wierdly placed From_ line
+ # XXX: maybe set unixfrom anyway? or only if not already?
+ continue
# Header continuation line
if line[0] in ' \t':
***************
*** 85,90 ****
i = line.find(':')
if i < 0:
! raise Errors.HeaderParseError(
! 'Not a header, not a continuation')
if lastheader:
container[lastheader] = NL.join(lastvalue)
--- 96,108 ----
i = line.find(':')
if i < 0:
! if self._strict:
! raise Errors.HeaderParseError(
! "Not a header, not a continuation: ``%s''"%line)
! elif lineno == 1 and line.startswith('--'):
! # allow through duplicate boundary tags.
! continue
! else:
! raise Errors.HeaderParseError(
! "Not a header, not a continuation: ``%s''"%line)
if lastheader:
container[lastheader] = NL.join(lastvalue)
***************
*** 123,127 ****
mo = cre.search(payload, start)
if mo:
! start += len(mo.group(0)) * (1 + isdigest)
# We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't
--- 141,145 ----
mo = cre.search(payload, start)
if mo:
! start += len(mo.group(0))
# We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't
***************
*** 130,151 ****
re.escape(separator) + '--')
mo = cre.search(payload, start)
! if not mo:
raise Errors.BoundaryError(
! "Couldn't find terminating boundary: %s" % boundary)
! terminator = mo.start()
! linesep = mo.group('sep')
! if mo.end() < len(payload):
! # there's some post-MIME boundary epilogue
! epilogue = payload[mo.end():]
# We split the textual payload on the boundary separator, which
! # includes the trailing newline. If the container is a
! # multipart/digest then the subparts are by default message/rfc822
! # instead of text/plain. In that case, they'll have an extra
! # newline before the headers to distinguish the message's headers
! # from the subpart headers.
! separator += linesep * (1 + isdigest)
parts = payload[start:terminator].split(linesep + separator)
for part in parts:
! msgobj = self.parsestr(part)
container.preamble = preamble
container.epilogue = epilogue
--- 148,198 ----
re.escape(separator) + '--')
mo = cre.search(payload, start)
! if mo:
! terminator = mo.start()
! linesep = mo.group('sep')
! if mo.end() < len(payload):
! # there's some post-MIME boundary epilogue
! epilogue = payload[mo.end():]
! elif self._strict:
raise Errors.BoundaryError(
! "Couldn't find terminating boundary: %s" % boundary)
! else:
! # handle the case of no trailing boundary. I hate mail clients.
! # check that it ends in a blank line
! endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
! mo = endre.search(payload)
! if not mo:
! raise Errors.BoundaryError(
! "Couldn't find terminating boundary, and no "+
! "trailing empty line")
! else:
! linesep = mo.group('sep')
! terminator = len(payload)
# We split the textual payload on the boundary separator, which
! # includes the trailing newline. If the container is a
! # multipart/digest then the subparts are by default message/rfc822
! # instead of text/plain. In that case, they'll have a optional
! # block of MIME headers, then an empty line followed by the
! # message headers.
! separator += linesep
parts = payload[start:terminator].split(linesep + separator)
for part in parts:
! if isdigest:
! if part[0] == linesep:
! # There's no header block so create an empty message
! # object as the container, and lop off the newline so
! # we can parse the sub-subobject
! msgobj = self._class()
! part = part[1:]
! else:
! parthdrs, part = part.split(linesep+linesep, 1)
! # msgobj in this case is the "message/rfc822" container
! msgobj = self.parsestr(parthdrs, headersonly=1)
! # while submsgobj is the message itself
! submsgobj = self.parsestr(part)
! msgobj.attach(submsgobj)
! msgobj.set_default_type('message/rfc822')
! else:
! msgobj = self.parsestr(part)
container.preamble = preamble
container.epilogue = epilogue