[Python-checkins] python/dist/src/Lib/email Header.py,1.17,1.17.6.1
bwarsaw@users.sourceforge.net
bwarsaw@users.sourceforge.net
Sat, 01 Mar 2003 19:37:21 -0800
Update of /cvsroot/python/python/dist/src/Lib/email
In directory sc8-pr-cvs1:/tmp/cvs-serv9598
Modified Files:
Tag: folding-reimpl-branch
Header.py
Log Message:
Re-implemented ASCII split algorithm. Committing on a branch until
the tests all pass. :/
Index: Header.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/email/Header.py,v
retrieving revision 1.17
retrieving revision 1.17.6.1
diff -C2 -d -r1.17 -r1.17.6.1
*** Header.py 30 Dec 2002 19:13:00 -0000 1.17
--- Header.py 2 Mar 2003 03:37:19 -0000 1.17.6.1
***************
*** 26,29 ****
--- 26,30 ----
CRLF = '\r\n'
NL = '\n'
+ SPACE = ' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
***************
*** 48,51 ****
--- 49,59 ----
''', re.VERBOSE | re.IGNORECASE)
+ pcre = re.compile('([,;])')
+
+ # Field name regexp, including trailing colon, but not separating whitespace,
+ # according to RFC 2822. Character range is from exclamation mark to tilde.
+ # For use with .match()
+ fcre = re.compile(r'[\041-\176]+:$')
+
***************
*** 127,131 ****
class Header:
! def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
continuation_ws=' ', errors='strict'):
"""Create a MIME-compliant header that can contain many character sets.
--- 135,140 ----
class Header:
! def __init__(self, s=None, charset=None,
! maxlinelen=None, header_name=None,
continuation_ws=' ', errors='strict'):
"""Create a MIME-compliant header that can contain many character sets.
***************
*** 254,258 ****
self._chunks.append((s, charset))
! def _split(self, s, charset, firstline=False):
# Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
--- 263,267 ----
self._chunks.append((s, charset))
! def _split(self, s, charset, firstline, splitchars):
# Split up a header safely for use with encode_chunks.
splittable = charset.to_splittable(s)
***************
*** 281,287 ****
# although it's possible that other charsets may also benefit from the
# higher-level syntactic breaks.
- #
elif charset == 'us-ascii':
! return self._ascii_split(s, charset, firstline)
# BAW: should we use encoded?
elif elen == len(s):
--- 290,295 ----
# although it's possible that other charsets may also benefit from the
# higher-level syntactic breaks.
elif charset == 'us-ascii':
! return self._split_ascii(s, charset, firstline, splitchars)
# BAW: should we use encoded?
elif elen == len(s):
***************
*** 297,377 ****
last = charset.from_splittable(splittable[halfway:], False)
# Do the split
! return self._split(first, charset, firstline) + \
! self._split(last, charset)
! def _ascii_split(self, s, charset, firstline):
! # Attempt to split the line at the highest-level syntactic break
! # possible. Note that we don't have a lot of smarts about field
! # syntax; we just try to break on semi-colons, then whitespace.
! rtn = []
! lines = s.splitlines()
! while lines:
! line = lines.pop(0)
! if firstline:
! maxlinelen = self._firstlinelen
! firstline = False
! else:
! #line = line.lstrip()
! maxlinelen = self._maxlinelen
! # Short lines can remain unchanged
! if len(line.replace('\t', SPACE8)) <= maxlinelen:
! rtn.append(line)
! else:
! oldlen = len(line)
! # Try to break the line on semicolons, but if that doesn't
! # work, try to split on folding whitespace.
! while len(line) > maxlinelen:
! i = line.rfind(';', 0, maxlinelen)
! if i < 0:
! break
! rtn.append(line[:i] + ';')
! line = line[i+1:]
! # Is the remaining stuff still longer than maxlinelen?
! if len(line) <= maxlinelen:
! # Splitting on semis worked
! rtn.append(line)
! continue
! # Splitting on semis didn't finish the job. If it did any
! # work at all, stick the remaining junk on the front of the
! # `lines' sequence and let the next pass do its thing.
! if len(line) <> oldlen:
! lines.insert(0, line)
! continue
! # Otherwise, splitting on semis didn't help at all.
! parts = re.split(r'(\s+)', line)
! if len(parts) == 1 or (len(parts) == 3 and
! parts[0].endswith(':')):
! # This line can't be split on whitespace. There's now
! # little we can do to get this into maxlinelen. BAW:
! # We're still potentially breaking the RFC by possibly
! # allowing lines longer than the absolute maximum of 998
! # characters. For now, let it slide.
! #
! # len(parts) will be 1 if this line has no `Field: '
! # prefix, otherwise it will be len(3).
! rtn.append(line)
! continue
! # There is whitespace we can split on.
! first = parts.pop(0)
! sublines = [first]
! acc = len(first)
! while parts:
! len0 = len(parts[0])
! len1 = len(parts[1])
! if acc + len0 + len1 <= maxlinelen:
! sublines.append(parts.pop(0))
! sublines.append(parts.pop(0))
! acc += len0 + len1
! else:
! # Split it here, but don't forget to ignore the
! # next whitespace-only part
! if first <> '':
! rtn.append(EMPTYSTRING.join(sublines))
! del parts[0]
! first = parts.pop(0)
! sublines = [first]
! acc = len(first)
! rtn.append(EMPTYSTRING.join(sublines))
! return [(chunk, charset) for chunk in rtn]
def _encode_chunks(self, newchunks):
--- 305,321 ----
last = charset.from_splittable(splittable[halfway:], False)
# Do the split
! return self._split(first, charset, firstline, splitchars) + \
! self._split(last, charset, False, splitchars)
! def _split_ascii(self, s, charset, firstline, splitchars):
! if firstline:
! firstlen = self._firstlinelen
! restlen = self._maxlinelen
! else:
! firstlen = restlen = self._maxlinelen
! line = _split_ascii(s, firstlen, restlen,
! self._continuation_ws, splitchars)
! lines = line.splitlines()
! return zip(lines, [charset]*len(lines))
def _encode_chunks(self, newchunks):
***************
*** 397,409 ****
for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
! # There's no encoding for this chunk's charsets
! _max_append(chunks, header, self._maxlinelen)
else:
! _max_append(chunks, charset.header_encode(header),
! self._maxlinelen, ' ')
joiner = NL + self._continuation_ws
return joiner.join(chunks)
! def encode(self):
"""Encode a message header into an RFC-compliant format.
--- 341,352 ----
for header, charset in newchunks:
if charset is None or charset.header_encoding is None:
! s = header
else:
! s = charset.header_encode(header)
! _max_append(chunks, s, self._maxlinelen, ' ')
joiner = NL + self._continuation_ws
return joiner.join(chunks)
! def encode(self, splitchars=';, '):
"""Encode a message header into an RFC-compliant format.
***************
*** 422,428 ****
If the given charset is not known or an error occurs during
conversion, this function will return the header untouched.
"""
newchunks = []
for s, charset in self._chunks:
! newchunks += self._split(s, charset, True)
return self._encode_chunks(newchunks)
--- 365,434 ----
If the given charset is not known or an error occurs during
conversion, this function will return the header untouched.
+
+ Optional splitchars is a string containing characters to split long
+ ASCII lines on, in rough support of RFC 2822's `highest level
+ syntactic breaks'. This doesn't affect RFC 2047 encoded lines.
"""
newchunks = []
for s, charset in self._chunks:
! newchunks += self._split(s, charset, True, splitchars)
return self._encode_chunks(newchunks)
+
+
+
+ def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
+ lines = []
+ maxlen = firstlen
+ for line in s.splitlines():
+ if len(line) < maxlen:
+ lines.append(line)
+ maxlen = restlen
+ continue
+ # Attempt to split the line at the highest-level syntactic break
+ # possible. Note that we don't have a lot of smarts about field
+ # syntax; we just try to break on semi-colons, then commas, then
+ # whitespace.
+ for ch in splitchars:
+ if line.find(ch) >= 0:
+ break
+ else:
+ # There's nothing useful to split the line on, not even spaces, so
+ # just append this line unchanged
+ lines.append(line)
+ maxlen = restlen
+ continue
+ # Now split the line on the character plus trailing whitespace
+ cre = re.compile(r'%s\s*' % ch)
+ if ch in ';,':
+ eol = ch
+ else:
+ eol = ''
+ joiner = eol + ' '
+ joinlen = len(joiner)
+ wslen = len(continuation_ws.replace('\t', SPACE8))
+ this = []
+ linelen = 0
+ for part in cre.split(line):
+ curlen = linelen + max(0, len(this)-1) * joinlen
+ partlen = len(part)
+ onfirstline = not lines
+ # We don't want to split after the field name, if we're on the
+ # first line and the field name is present in the header string.
+ if ch == ' ' and onfirstline and \
+ len(this) == 1 and fcre.match(this[0]):
+ this.append(part)
+ linelen += partlen
+ elif curlen + partlen > maxlen:
+ if this:
+ lines.append(joiner.join(this) + eol)
+ this = [part]
+ linelen = wslen + partlen
+ maxlen = restlen
+ else:
+ this.append(part)
+ linelen += partlen
+ # Put any left over parts on a line by themselves
+ if this:
+ lines.append(joiner.join(this))
+ linejoiner = '\n' + continuation_ws
+ return linejoiner.join(lines)