[pypy-svn] pypy default: Following CPython, extensive rewrite of the utf-7 decoder,
amauryfa
commits-noreply at bitbucket.org
Tue Feb 8 18:32:59 CET 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch:
Changeset: r41704:69d17c51e630
Date: 2011-02-08 14:55 +0100
http://bitbucket.org/pypy/pypy/changeset/69d17c51e630/
Log: Following CPython, extensive rewrite of the utf-7 decoder, which did
not accept some legal sequences. (issue4426)
Add utf-7 to the systematic tests, and ensure that it works with
surrogate pairs.
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -99,13 +99,13 @@
def test_all_first_256(self):
for i in range(256):
- for encoding in ("utf-8 latin-1 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 latin-1 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkdecode(unichr(i), encoding)
def test_first_10000(self):
for i in range(10000):
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkdecode(unichr(i), encoding)
@@ -115,13 +115,13 @@
if 0xd800 <= v <= 0xdfff:
continue
uni = unichr(v)
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkdecode(uni, encoding)
def test_maxunicode(self):
uni = unichr(sys.maxunicode)
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkdecode(uni, encoding)
@@ -184,22 +184,27 @@
assert decode(s, 4, None) == (u'a+-', 4)
assert decode(s, 5, None) == (u'a+-b', 5)
+ def test_utf7_surrogates(self):
+ assert u'\U000abcde'.encode('utf-7') == '+2m/c3g-'
+ raises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
+ assert unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd\ufffd'
+
class TestEncoding(UnicodeTests):
def test_all_ascii(self):
for i in range(128):
- for encoding in "utf-8 latin-1 ascii".split():
+ for encoding in "utf-7 utf-8 latin-1 ascii".split():
self.checkencode(unichr(i), encoding)
def test_all_first_256(self):
for i in range(256):
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkencode(unichr(i), encoding)
def test_first_10000(self):
for i in range(10000):
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkencode(unichr(i), encoding)
@@ -209,13 +214,13 @@
if 0xd800 <= v <= 0xdfff:
continue
uni = unichr(v)
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkencode(uni, encoding)
def test_maxunicode(self):
uni = unichr(sys.maxunicode)
- for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+ for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
self.checkencode(uni, encoding)
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -579,81 +579,91 @@
# ____________________________________________________________
# utf-7
-## indicate whether a UTF-7 character is special i.e. cannot be directly
-## encoded:
-## 0 - not special
-## 1 - special
-## 2 - whitespace (optional)
-## 3 - RFC2152 Set O (optional)
+# Three simple macros defining base-64
-_utf7_special = [
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
-]
-
-def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
- return (oc > 127 or _utf7_special[oc] == 1 or
- (encodeWS and _utf7_special[oc] == 2) or
- (encodeO and _utf7_special[oc] == 3))
-
-def _utf7_B64CHAR(oc):
- if oc > 127:
- return False
+def _utf7_IS_BASE64(oc):
+ "Is c a base-64 character?"
c = chr(oc)
return c.isalnum() or c == '+' or c == '/'
def _utf7_TO_BASE64(n):
"Returns the base-64 character of the bottom 6 bits of n"
return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
def _utf7_FROM_BASE64(c):
- "Retuns the base-64 value of a base-64 character"
- if c == '+':
- return 62
- elif c == '/':
- return 63
- elif c >= 'a':
+ "given that c is a base-64 character, what is its base-64 value?"
+ if c >= 'a':
return ord(c) - 71
elif c >= 'A':
return ord(c) - 65
- else:
+ elif c >= '0':
return ord(c) + 4
+ elif c == '+':
+ return 62
+ else: # c == '/'
+ return 63
-def _utf7_ENCODE(result, ch, bits):
- while bits >= 6:
- result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
- bits -= 6
- return bits
+def _utf7_DECODE_DIRECT(oc):
+ return oc <= 127 and oc != ord('+')
-def _utf7_DECODE(s, result, errorhandler, errors,
- pos, charsleft, bitsleft, surrogate):
- while bitsleft >= 16:
- outCh = (charsleft >> (bitsleft-16)) & 0xffff
- bitsleft -= 16
+# The UTF-7 encoder treats ASCII characters differently according to
+# whether they are Set D, Set O, Whitespace, or special (i.e. none of
+# the above). See RFC2152. This array identifies these different
+# sets:
+# 0 : "Set D"
+# alphanumeric and '(),-./:?
+# 1 : "Set O"
+# !"#$%&*;<=>@[]^_`{|}
+# 2 : "whitespace"
+# ht nl cr sp
+# 3 : special (must be base64 encoded)
+# everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
- if surrogate:
- ## We have already generated an error for the high
- ## surrogate so let's not bother seeing if the low
- ## surrogate is correct or not
- surrogate = False
- elif 0xDC00 <= outCh <= 0xDFFF:
- ## This is a surrogate pair. Unfortunately we can't
- ## represent it in a 16-bit character
- surrogate = True
- msg = "code pairs are not supported"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- bitsleft = 0
- break
- else:
- result.append(unichr(outCh))
- return pos, charsleft, bitsleft, surrogate
+utf7_category = [
+# nul soh stx etx eot enq ack bel bs ht nl vt np cr so si
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
+# dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+# sp ! " # $ % & ' ( ) * + , - . /
+ 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
+# 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+# @ A B C D E F G H I J K L M N O
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+# P Q R S T U V W X Y Z [ \ ] ^ _
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
+# ` a b c d e f g h i j k l m n o
+ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+# p q r s t u v w x y z { | } ~ del
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
+]
+# ENCODE_DIRECT: this character should be encoded as itself. The
+# answer depends on whether we are encoding set O as itself, and also
+# on whether we are encoding whitespace as itself. RFC2152 makes it
+# clear that the answers to these questions vary between
+# applications, so this code needs to be flexible.
+
+def _utf7_ENCODE_DIRECT(oc, directO, directWS):
+ return(oc < 128 and oc > 0 and
+ (utf7_category[oc] == 0 or
+ (directWS and utf7_category[oc] == 2) or
+ (directO and utf7_category[oc] == 1)))
+
+def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
+ if MAXUNICODE > 65535 and oc >= 0x10000:
+ # code first surrogate
+ base64bits += 16
+ base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
+ while base64bits >= 6:
+ result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+ base64bits -= 6
+ # prepare second surrogate
+ oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
+ base64bits += 16
+ base64buffer = (base64buffer << 16) | oc
+ while base64bits >= 6:
+ result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+ base64bits -= 6
+ return base64bits, base64buffer
def str_decode_utf_7(s, size, errors, final=False,
errorhandler=None):
@@ -663,83 +673,126 @@
return u'', 0
inShift = False
- bitsleft = 0
- startinpos = 0
- charsleft = 0
- surrogate = False
+ base64bits = 0
+ base64buffer = 0
+ surrogate = 0
result = UnicodeBuilder(size)
pos = 0
+ shiftOutStartPos = 0
while pos < size:
ch = s[pos]
oc = ord(ch)
- if inShift:
- if ch == '-' or not _utf7_B64CHAR(oc):
+ if inShift: # in a base-64 section
+ if _utf7_IS_BASE64(oc): #consume a base-64 character
+ base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
+ base64bits += 6
+ pos += 1
+
+ if base64bits >= 16:
+ # enough bits for a UTF-16 value
+ outCh = base64buffer >> (base64bits - 16)
+ base64bits -= 16
+ base64buffer &= (1 << base64bits) - 1 # clear high bits
+ if surrogate:
+ # expecting a second surrogate
+ if outCh >= 0xDC00 and outCh <= 0xDFFFF:
+ if MAXUNICODE < 65536:
+ result.append(unichr(surrogate))
+ result.append(unichr(outCh))
+ else:
+ result.append(
+ UNICHR((((surrogate & 0x3FF)<<10) |
+ (outCh & 0x3FF)) + 0x10000))
+ else:
+ surrogate = 0
+ msg = "second surrogate missing"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ continue
+ elif outCh >= 0xD800 and outCh <= 0xDBFF:
+ # first surrogate
+ surrogate = outCh
+ elif outCh >= 0xDC00 and outCh <= 0xDFFF:
+ msg = "unexpected second surrogate"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ continue
+ else:
+ result.append(unichr(outCh))
+
+ else:
+ # now leaving a base-64 section
inShift = 0
pos += 1
- pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
- s, result, errorhandler, errors,
- pos, charsleft, bitsleft, surrogate)
- if bitsleft >= 6:
- ## The shift sequence has a partial character in it. If
- ## bitsleft < 6 then we could just classify it as padding
- ## but that is not the case here
- msg = "partial character in shift sequence"
+ if surrogate:
+ msg = "second surrogate missing at end of shift sequence"
res, pos = errorhandler(errors, 'utf-7',
msg, s, pos-1, pos)
result.append(res)
- ## According to RFC2152 the remaining bits should be
- ## zero. We choose to signal an error/insert a replacement
- ## character here so indicate the potential of a
- ## misencoded character.
+ continue
+
+ if base64bits > 0: # left-over bits
+ if base64bits >= 6:
+ # We've seen at least one base-64 character
+ msg = "partial character in shift sequence"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ continue
+ else:
+ # Some bits remain; they should be zero
+ if base64buffer != 0:
+ msg = "non-zero padding bits in shift sequence"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ continue
+
if ch == '-':
- if pos < size and s[pos] == '-':
- result.append(u'-')
- inShift = True
+ # '-' is absorbed; other terminating characters are
+ # preserved
+ pass
+ else:
+ result.append(ch)
- elif _utf7_SPECIAL(oc):
- msg = "unexpected special character"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- else:
- result.append(unichr(ord(ch)))
- else:
- charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
- bitsleft += 6
- pos += 1
-
- pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
- s, result, errorhandler, errors,
- pos, charsleft, bitsleft, surrogate)
elif ch == '+':
startinpos = pos
- pos += 1
- if pos < size and s[pos] == '-':
+ pos += 1 # consume '+'
+ if pos < size and s[pos] == '-': # '+-' encodes '+'
pos += 1
result.append(u'+')
- else:
+ else: # begin base64-encoded section
inShift = 1
+ shiftOutStartPos = pos - 1
bitsleft = 0
- elif _utf7_SPECIAL(oc):
+ elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
+ result.append(unichr(oc))
+ pos += 1
+ else:
pos += 1
msg = "unexpected special character"
res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
result.append(res)
- else:
- result.append(unichr(oc))
- pos += 1
- if inShift and final:
- endinpos = size
- msg = "unterminated shift sequence"
- res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
- result.append(res)
+ # end of string
+
+ if inShift and final: # in shift sequence, no more to follow
+ # if we're in an inconsistent state, that's an error
+ if (surrogate or
+ base64bits >= 6 or
+ (base64bits > 0 and base64buffer != 0)):
+ endinpos = size
+ msg = "unterminated shift sequence"
+ res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+ result.append(res)
elif inShift:
- pos = startinpos
+ pos = shiftOutStartPos # back off output
return result.build(), pos
@@ -751,8 +804,8 @@
encodeSetO = encodeWhiteSpace = False
inShift = False
- bitsleft = 0
- charsleft = 0
+ base64bits = 0
+ base64buffer = 0
pos = 0
while pos < size:
@@ -761,53 +814,36 @@
if not inShift:
if ch == u'+':
result.append('+-')
- elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
- charsleft = oc
- bitsleft = 16
+ elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+ result.append(chr(oc))
+ else:
result.append('+')
- bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
- inShift = bitsleft > 0
- else:
- result.append(chr(oc))
+ inShift = True
+ base64bits, base64buffer = _utf7_ENCODE_CHAR(
+ result, oc, base64bits, base64buffer)
else:
- if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
- result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
- charsleft = 0
- bitsleft = 0
+ if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+ # shifting out
+ if base64bits: # output remaining bits
+ result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+ base64buffer = 0
+ base64bits = 0
+
+ inShift = False
## Characters not in the BASE64 set implicitly unshift the
## sequence so no '-' is required, except if the character is
## itself a '-'
- if _utf7_B64CHAR(oc) or ch == u'-':
+ if _utf7_IS_BASE64(oc) or ch == u'-':
result.append('-')
- inShift = False
result.append(chr(oc))
else:
- bitsleft += 16
- charsleft = (charsleft << 16) | oc
- bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
- ## If the next character is special then we dont' need to
- ## terminate the shift sequence. If the next character is not
- ## a BASE64 character or '-' then the shift sequence will be
- ## terminated implicitly and we don't have to insert a '-'.
- if bitsleft == 0:
- if pos + 1 < size:
- ch2 = s[pos + 1]
- oc2 = ord(ch2)
-
- if _utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace):
- pass
- elif _utf7_B64CHAR(oc2) or ch2 == u'-':
- result.append('-')
- inShift = False
- else:
- inShift = False
- else:
- result.append('-')
- inShift = False
+ base64bits, base64buffer = _utf7_ENCODE_CHAR(
+ result, oc, base64bits, base64buffer)
pos += 1
- if bitsleft:
- result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+ if base64bits:
+ result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
+ if inShift:
result.append('-')
return result.build()
More information about the Pypy-commit
mailing list