[pypy-svn] pypy default: Following CPython, extensive rewrite of the utf-7 decoder,

amauryfa commits-noreply at bitbucket.org
Tue Feb 8 18:32:59 CET 2011


Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: 
Changeset: r41704:69d17c51e630
Date: 2011-02-08 14:55 +0100
http://bitbucket.org/pypy/pypy/changeset/69d17c51e630/

Log:	Following CPython, extensive rewrite of the utf-7 decoder, which did
	not accept some legal sequences. (issue4426)

	Add utf-7 to the systematic tests, and ensure that it works with
	surrogate pairs.

diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -99,13 +99,13 @@
 
     def test_all_first_256(self):
         for i in range(256):
-            for encoding in ("utf-8 latin-1 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 latin-1 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkdecode(unichr(i), encoding)
 
     def test_first_10000(self):
         for i in range(10000):
-            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkdecode(unichr(i), encoding)
 
@@ -115,13 +115,13 @@
             if 0xd800 <= v <= 0xdfff:
                 continue
             uni = unichr(v)
-            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkdecode(uni, encoding)
 
     def test_maxunicode(self):
         uni = unichr(sys.maxunicode)
-        for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+        for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                          "utf-32 utf-32-be utf-32-le").split():
             self.checkdecode(uni, encoding)
 
@@ -184,22 +184,27 @@
         assert decode(s, 4, None) == (u'a+-', 4)
         assert decode(s, 5, None) == (u'a+-b', 5)
 
+    def test_utf7_surrogates(self):
+        assert u'\U000abcde'.encode('utf-7') == '+2m/c3g-'
+        raises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
+        assert unicode('+3ADYAA-', 'utf-7', 'replace') == u'\ufffd\ufffd'
+
 
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):
         for i in range(128):
-            for encoding in "utf-8 latin-1 ascii".split():
+            for encoding in "utf-7 utf-8 latin-1 ascii".split():
                 self.checkencode(unichr(i), encoding)
 
     def test_all_first_256(self):
         for i in range(256):
-            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkencode(unichr(i), encoding)
 
     def test_first_10000(self):
         for i in range(10000):
-            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkencode(unichr(i), encoding)
 
@@ -209,13 +214,13 @@
             if 0xd800 <= v <= 0xdfff:
                 continue
             uni = unichr(v)
-            for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+            for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                              "utf-32 utf-32-be utf-32-le").split():
                 self.checkencode(uni, encoding)                
 
     def test_maxunicode(self):
         uni = unichr(sys.maxunicode)
-        for encoding in ("utf-8 utf-16 utf-16-be utf-16-le "
+        for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
                          "utf-32 utf-32-be utf-32-le").split():
             self.checkencode(uni, encoding)        
 

diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -579,81 +579,91 @@
 # ____________________________________________________________
 # utf-7
 
-## indicate whether a UTF-7 character is special i.e. cannot be directly
-##       encoded:
-##         0 - not special
-##         1 - special
-##         2 - whitespace (optional)
-##         3 - RFC2152 Set O (optional)
+# Three simple macros defining base-64
 
-_utf7_special = [
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
-]
-
-def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
-    return (oc > 127 or _utf7_special[oc] == 1 or
-            (encodeWS and _utf7_special[oc] == 2) or
-            (encodeO and _utf7_special[oc] == 3))
-
-def _utf7_B64CHAR(oc):
-    if oc > 127:
-        return False
+def _utf7_IS_BASE64(oc):
+    "Is c a base-64 character?"
     c = chr(oc)
     return c.isalnum() or c == '+' or c == '/'
 def _utf7_TO_BASE64(n):
     "Returns the base-64 character of the bottom 6 bits of n"
     return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
 def _utf7_FROM_BASE64(c):
-    "Retuns the base-64 value of a base-64 character"
-    if c == '+':
-        return 62
-    elif c == '/':
-        return 63
-    elif c >= 'a':
+    "given that c is a base-64 character, what is its base-64 value?"
+    if c >= 'a':
         return ord(c) - 71
     elif c >= 'A':
         return ord(c) - 65
-    else:
+    elif c >= '0':
         return ord(c) + 4
+    elif c == '+':
+        return 62
+    else: # c == '/'
+        return 63
 
-def _utf7_ENCODE(result, ch, bits):
-    while bits >= 6:
-        result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
-        bits -= 6
-    return bits
+def _utf7_DECODE_DIRECT(oc):
+    return oc <= 127 and oc != ord('+')
 
-def _utf7_DECODE(s, result, errorhandler, errors,
-                 pos, charsleft, bitsleft, surrogate):
-    while bitsleft >= 16:
-        outCh =  (charsleft >> (bitsleft-16)) & 0xffff
-        bitsleft -= 16
+# The UTF-7 encoder treats ASCII characters differently according to
+# whether they are Set D, Set O, Whitespace, or special (i.e. none of
+# the above).  See RFC2152.  This array identifies these different
+# sets:
+# 0 : "Set D"
+#      alphanumeric and '(),-./:?
+# 1 : "Set O"
+#     !"#$%&*;<=>@[]^_`{|}
+# 2 : "whitespace"
+#     ht nl cr sp
+# 3 : special (must be base64 encoded)
+#     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 
-        if surrogate:
-            ## We have already generated an error for the high
-            ## surrogate so let's not bother seeing if the low
-            ## surrogate is correct or not
-            surrogate = False
-        elif 0xDC00 <= outCh <= 0xDFFF:
-            ## This is a surrogate pair. Unfortunately we can't
-            ## represent it in a 16-bit character
-            surrogate = True
-            msg = "code pairs are not supported"
-            res, pos = errorhandler(errors, 'utf-7',
-                                    msg, s, pos-1, pos)
-            result.append(res)
-            bitsleft = 0
-            break
-        else:
-            result.append(unichr(outCh))
-    return pos, charsleft, bitsleft, surrogate
+utf7_category = [
+#  nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
+#  dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us
+    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+#  sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
+    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
+#   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ? 
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
+#   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
+#   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
+    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+#   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
+]
 
+# ENCODE_DIRECT: this character should be encoded as itself.  The
+# answer depends on whether we are encoding set O as itself, and also
+# on whether we are encoding whitespace as itself.  RFC2152 makes it
+# clear that the answers to these questions vary between
+# applications, so this code needs to be flexible.
+
+def _utf7_ENCODE_DIRECT(oc, directO, directWS):
+    return(oc < 128 and oc > 0 and
+           (utf7_category[oc] == 0 or
+            (directWS and utf7_category[oc] == 2) or
+            (directO and utf7_category[oc] == 1)))
+
+def _utf7_ENCODE_CHAR(result, oc, base64bits, base64buffer):
+    if MAXUNICODE > 65535 and oc >= 0x10000:
+        # code first surrogate
+        base64bits += 16
+        base64buffer = (base64buffer << 16) | 0xd800 | ((oc-0x10000) >> 10)
+        while base64bits >= 6:
+            result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+            base64bits -= 6
+        # prepare second surrogate
+        oc = 0xDC00 | ((oc-0x10000) & 0x3FF)
+    base64bits += 16
+    base64buffer = (base64buffer << 16) | oc
+    while base64bits >= 6:
+        result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+        base64bits -= 6
+    return base64bits, base64buffer
 
 def str_decode_utf_7(s, size, errors, final=False,
                      errorhandler=None):
@@ -663,83 +673,126 @@
         return u'', 0
 
     inShift = False
-    bitsleft = 0
-    startinpos = 0
-    charsleft = 0
-    surrogate = False
+    base64bits = 0
+    base64buffer = 0
+    surrogate = 0
 
     result = UnicodeBuilder(size)
     pos = 0
+    shiftOutStartPos = 0
     while pos < size:
         ch = s[pos]
         oc = ord(ch)
 
-        if inShift:
-            if ch == '-' or not _utf7_B64CHAR(oc):
+        if inShift: # in a base-64 section
+            if _utf7_IS_BASE64(oc): #consume a base-64 character
+                base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
+                base64bits += 6
+                pos += 1
+
+                if base64bits >= 16:
+                    # enough bits for a UTF-16 value
+                    outCh = base64buffer >> (base64bits - 16)
+                    base64bits -= 16
+                    base64buffer &= (1 << base64bits) - 1 # clear high bits
+                    if surrogate:
+                        # expecting a second surrogate
+                        if outCh >= 0xDC00 and outCh <= 0xDFFFF:
+                            if MAXUNICODE < 65536:
+                                result.append(unichr(surrogate))
+                                result.append(unichr(outCh))
+                            else:
+                                result.append(
+                                    UNICHR((((surrogate & 0x3FF)<<10) |
+                                            (outCh & 0x3FF)) + 0x10000))
+                        else:
+                            surrogate = 0
+                            msg = "second surrogate missing"
+                            res, pos = errorhandler(errors, 'utf-7',
+                                                    msg, s, pos-1, pos)
+                            result.append(res)
+                            continue
+                    elif outCh >= 0xD800 and outCh <= 0xDBFF:
+                        # first surrogate
+                        surrogate = outCh
+                    elif outCh >= 0xDC00 and outCh <= 0xDFFF:
+                        msg = "unexpected second surrogate"
+                        res, pos = errorhandler(errors, 'utf-7',
+                                                msg, s, pos-1, pos)
+                        result.append(res)
+                        continue
+                    else:
+                        result.append(unichr(outCh))
+
+            else:
+                # now leaving a base-64 section
                 inShift = 0
                 pos += 1
 
-                pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
-                    s, result, errorhandler, errors,
-                    pos, charsleft, bitsleft, surrogate)
-                if bitsleft >= 6:
-                    ## The shift sequence has a partial character in it. If
-                    ## bitsleft < 6 then we could just classify it as padding
-                    ## but that is not the case here
-                    msg = "partial character in shift sequence"
+                if surrogate:
+                    msg = "second surrogate missing at end of shift sequence"
                     res, pos = errorhandler(errors, 'utf-7',
                                             msg, s, pos-1, pos)
                     result.append(res)
-                    ## According to RFC2152 the remaining bits should be
-                    ## zero. We choose to signal an error/insert a replacement
-                    ## character here so indicate the potential of a
-                    ## misencoded character.
+                    continue
+
+                if base64bits > 0: # left-over bits
+                    if base64bits >= 6:
+                        # We've seen at least one base-64 character
+                        msg = "partial character in shift sequence"
+                        res, pos = errorhandler(errors, 'utf-7',
+                                                msg, s, pos-1, pos)
+                        result.append(res)
+                        continue
+                    else:
+                        # Some bits remain; they should be zero
+                        if base64buffer != 0:
+                            msg = "non-zero padding bits in shift sequence"
+                            res, pos = errorhandler(errors, 'utf-7',
+                                                    msg, s, pos-1, pos)
+                            result.append(res)
+                            continue
+
                 if ch == '-':
-                    if pos < size and s[pos] == '-':
-                        result.append(u'-')
-                        inShift = True
+                    # '-' is absorbed; other terminating characters are
+                    # preserved
+                    pass
+                else:
+                    result.append(ch)
 
-                elif _utf7_SPECIAL(oc):
-                    msg = "unexpected special character"
-                    res, pos = errorhandler(errors, 'utf-7',
-                                            msg, s, pos-1, pos)
-                    result.append(res)
-                else:
-                    result.append(unichr(ord(ch)))
-            else:
-                charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
-                bitsleft += 6
-                pos += 1
-
-                pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
-                    s, result, errorhandler, errors,
-                    pos, charsleft, bitsleft, surrogate)
         elif ch == '+':
             startinpos = pos
-            pos += 1
-            if pos < size and s[pos] == '-':
+            pos += 1 # consume '+'
+            if pos < size and s[pos] == '-': # '+-' encodes '+'
                 pos += 1
                 result.append(u'+')
-            else:
+            else: # begin base64-encoded section
                 inShift = 1
+                shiftOutStartPos = pos - 1
                 bitsleft = 0
 
-        elif _utf7_SPECIAL(oc):
+        elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
+            result.append(unichr(oc))
+            pos += 1
+        else:
             pos += 1
             msg = "unexpected special character"
             res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
             result.append(res)
-        else:
-            result.append(unichr(oc))
-            pos += 1
 
-    if inShift and final:
-        endinpos = size
-        msg = "unterminated shift sequence"
-        res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
-        result.append(res)
+    # end of string
+
+    if inShift and final: # in shift sequence, no more to follow
+        # if we're in an inconsistent state, that's an error
+        if (surrogate or
+            base64bits >= 6 or
+            (base64bits > 0 and base64buffer != 0)):
+            endinpos = size
+            msg = "unterminated shift sequence"
+            res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+            result.append(res)
     elif inShift:
-        pos = startinpos
+        pos = shiftOutStartPos # back off output
 
     return result.build(), pos
 
@@ -751,8 +804,8 @@
     encodeSetO = encodeWhiteSpace = False
 
     inShift = False
-    bitsleft = 0
-    charsleft = 0
+    base64bits = 0
+    base64buffer = 0
 
     pos = 0
     while pos < size:
@@ -761,53 +814,36 @@
         if not inShift:
             if ch == u'+':
                 result.append('+-')
-            elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
-                charsleft = oc
-                bitsleft = 16
+            elif _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                result.append(chr(oc))
+            else:
                 result.append('+')
-                bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
-                inShift = bitsleft > 0
-            else:
-                result.append(chr(oc))
+                inShift = True
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
         else:
-            if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
-                result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
-                charsleft = 0
-                bitsleft = 0
+            if _utf7_ENCODE_DIRECT(oc, not encodeSetO, not encodeWhiteSpace):
+                # shifting out
+                if base64bits: # output remaining bits
+                    result.append(_utf7_TO_BASE64(base64buffer >> (base64bits-6)))
+                    base64buffer = 0
+                    base64bits = 0
+
+                inShift = False
                 ## Characters not in the BASE64 set implicitly unshift the
                 ## sequence so no '-' is required, except if the character is
                 ## itself a '-'
-                if _utf7_B64CHAR(oc) or ch == u'-':
+                if _utf7_IS_BASE64(oc) or ch == u'-':
                     result.append('-')
-                inShift = False
                 result.append(chr(oc))
             else:
-                bitsleft += 16
-                charsleft = (charsleft << 16) | oc
-                bitsleft =  _utf7_ENCODE(result, charsleft, bitsleft)
-                ## If the next character is special then we dont' need to
-                ## terminate the shift sequence. If the next character is not
-                ## a BASE64 character or '-' then the shift sequence will be
-                ## terminated implicitly and we don't have to insert a '-'.
-                if bitsleft == 0:
-                    if pos + 1 < size:
-                        ch2 = s[pos + 1]
-                        oc2 = ord(ch2)
-
-                        if _utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace):
-                            pass
-                        elif _utf7_B64CHAR(oc2) or ch2 == u'-':
-                            result.append('-')
-                            inShift = False
-                        else:
-                            inShift = False
-                    else:
-                        result.append('-')
-                        inShift = False
+                base64bits, base64buffer = _utf7_ENCODE_CHAR(
+                    result, oc, base64bits, base64buffer)
         pos += 1
 
-    if bitsleft:
-        result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+    if base64bits:
+        result.append(_utf7_TO_BASE64(base64buffer << (6 - base64bits)))
+    if inShift:
         result.append('-')
 
     return result.build()


More information about the Pypy-commit mailing list