[pypy-svn] r75644 - in pypy/branch/interplevel-codecs/pypy: module/_codecs rlib

afa at codespeak.net afa at codespeak.net
Mon Jun 28 18:53:43 CEST 2010


Author: afa
Date: Mon Jun 28 18:53:42 2010
New Revision: 75644

Modified:
   pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
   pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Rewrite utf-7 codec at interp-level.
This should dramatically reduce the generated C code.


Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	Mon Jun 28 18:53:42 2010
@@ -11,8 +11,6 @@
          'escape_encode' :  'app_codecs.escape_encode',
          'unicode_internal_decode' :  'app_codecs.unicode_internal_decode',
          'unicode_internal_encode' :  'app_codecs.unicode_internal_encode',
-         'utf_7_decode' :  'app_codecs.utf_7_decode',
-         'utf_7_encode' :  'app_codecs.utf_7_encode',
          'charmap_build' : 'app_codecs.charmap_build'
     }
     interpleveldefs = {
@@ -28,6 +26,8 @@
          'ascii_encode'     : 'interp_codecs.ascii_encode',
          'latin_1_decode'   : 'interp_codecs.latin_1_decode',
          'latin_1_encode'   : 'interp_codecs.latin_1_encode',
+         'utf_7_decode'     : 'interp_codecs.utf_7_decode',
+         'utf_7_encode'     : 'interp_codecs.utf_7_encode',
          'utf_8_decode'     : 'interp_codecs.utf_8_decode',
          'utf_8_encode'     : 'interp_codecs.utf_8_encode',
          'utf_16_be_decode' : 'interp_codecs.utf_16_be_decode',

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	Mon Jun 28 18:53:42 2010
@@ -46,13 +46,6 @@
     v = s[1:-1]
     return v, len(v)
 
-def utf_7_decode( data, errors='strict'):
-    """None
-    """
-    res = PyUnicode_DecodeUTF7(data, len(data), errors)
-    res = u''.join(res)
-    return res, len(data)
-
 def charmap_encode(obj, errors='strict', mapping=None):
     """None
     """
@@ -175,238 +168,8 @@
     res = ''.join(res)    
     return res, len(data)
 
-
-def utf_7_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors)
-    res = ''.join(res)
-    return res, len(res)    
-
 #  ----------------------------------------------------------------------
 
-##import sys
-##""" Python implementation of CPythons builtin unicode codecs.
-##
-##    Generally the functions in this module take a list of characters an returns 
-##    a list of characters.
-##    
-##    For use in the PyPy project"""
-
-
-## indicate whether a UTF-7 character is special i.e. cannot be directly
-##       encoded:
-##         0 - not special
-##         1 - special
-##         2 - whitespace (optional)
-##         3 - RFC2152 Set O (optional)
-    
-utf7_special = [
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
-    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
-]
-
-    
-def SPECIAL(c, encodeO, encodeWS):
-    c = ord(c)
-    return (c>127 or utf7_special[c] == 1) or \
-            (encodeWS and (utf7_special[(c)] == 2)) or \
-            (encodeO and (utf7_special[(c)] == 3))
-def B64(n):
-    return ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
-def B64CHAR(c):
-    return (c.isalnum() or (c) == '+' or (c) == '/')
-def UB64(c):
-    if (c) == '+' :
-        return 62 
-    elif (c) == '/':
-        return 63 
-    elif (c) >= 'a':
-        return ord(c) - 71 
-    elif (c) >= 'A':
-        return ord(c) - 65 
-    else: 
-        return ord(c) + 4
-
-def ENCODE( ch, bits) :
-    out = []
-    while (bits >= 6):
-        out +=  B64(ch >> (bits-6))
-        bits -= 6 
-    return out, bits
-
-def PyUnicode_DecodeUTF7(s, size, errors):
-    from _codecs import lookup_error
-    errmsg = ""
-    inShift = 0
-    bitsleft = 0
-    charsleft = 0
-    surrogate = 0
-    startinpos = 0
-    p = []
-    errorHandler = None
-    exc = None
-
-    if (size == 0):
-        return unicode('')
-    i = 0
-    while i < size:
-        
-        ch = s[i]
-        if (inShift):
-            if ((ch == '-') or not B64CHAR(ch)):
-                inShift = 0
-                i += 1
-                
-                while (bitsleft >= 16):
-                    outCh =  ((charsleft) >> (bitsleft-16)) & 0xffff
-                    bitsleft -= 16
-                    
-                    if (surrogate):
-                        ##            We have already generated an error for the high surrogate
-                        ##            so let's not bother seeing if the low surrogate is correct or not 
-                        surrogate = 0
-                    elif (0xDC00 <= (outCh) and (outCh) <= 0xDFFF):
-            ##             This is a surrogate pair. Unfortunately we can't represent 
-            ##               it in a 16-bit character 
-                        surrogate = 1
-                        msg = "code pairs are not supported"
-                        out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
-                        p += out
-                        bitsleft = 0
-                        break
-                    else:
-                        p +=  unichr(outCh )
-                        #p += out
-                if (bitsleft >= 6):
-##                    /* The shift sequence has a partial character in it. If
-##                       bitsleft < 6 then we could just classify it as padding
-##                       but that is not the case here */
-                    msg = "partial character in shift sequence"
-                    out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
-                    
-##                /* According to RFC2152 the remaining bits should be zero. We
-##                   choose to signal an error/insert a replacement character
-##                   here so indicate the potential of a misencoded character. */
-
-##                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
-##                if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))):
-##                    raise UnicodeDecodeError, "non-zero padding bits in shift sequence"
-                if (ch == '-') :
-                    if ((i < size) and (s[i] == '-')) :
-                        p +=  '-'
-                        inShift = 1
-                    
-                elif SPECIAL(ch, 0, 0) :
-                    msg = "unexpected special character"
-                    out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
-                    p += out
-                else:  
-                    p +=  ch 
-            else:
-                charsleft = (charsleft << 6) | UB64(ch)
-                bitsleft += 6
-                i += 1
-##                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
-        elif ( ch == '+' ):
-            startinpos = i
-            i += 1
-            if (i<size and s[i] == '-'):
-                i += 1
-                p +=  '+'
-            else:
-                inShift = 1
-                bitsleft = 0
-                
-        elif (SPECIAL(ch, 0, 0)):
-            i += 1
-            msg = "unexpected special character"
-            out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
-            p += out
-        else:
-            p +=  ch 
-            i += 1
-
-    if (inShift) :
-        endinpos = size
-        msg = "unterminated shift sequence"
-        out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, startinpos, i)
-        p += out
-    return p
-
-def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
-
-#    /* It might be possible to tighten this worst case */
-    inShift = False
-    i = 0
-    bitsleft = 0
-    charsleft = 0
-    out = []
-    for ch in s:
-        if (not inShift) :
-            if (ch == '+'):
-                out +=  '+'
-                out +=  '-'
-            elif (SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
-                charsleft = ord(ch)
-                bitsleft = 16
-                out += '+'
-                p, bitsleft = ENCODE( charsleft, bitsleft)
-                out += p
-                inShift = bitsleft > 0
-            else:
-                out += chr(ord(ch))
-        else:
-            if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
-                out += B64((charsleft) << (6-bitsleft))
-                charsleft = 0
-                bitsleft = 0
-##                /* Characters not in the BASE64 set implicitly unshift the sequence
-##                   so no '-' is required, except if the character is itself a '-' */
-                if (B64CHAR(ch) or ch == '-'):
-                    out += '-'
-                inShift = False
-                out += chr(ord(ch))
-            else:
-                bitsleft += 16
-                charsleft = (((charsleft) << 16) | ord(ch))
-                p, bitsleft =  ENCODE(charsleft, bitsleft)
-                out += p
-##                /* If the next character is special then we dont' need to terminate
-##                   the shift sequence. If the next character is not a BASE64 character
-##                   or '-' then the shift sequence will be terminated implicitly and we
-##                   don't have to insert a '-'. */
-
-                if (bitsleft == 0):
-                    if (i + 1 < size):
-                        ch2 = s[i+1]
-
-                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)):
-                            pass
-                        elif (B64CHAR(ch2) or ch2 == '-'):
-                            out +=  '-'
-                            inShift = False
-                        else:
-                            inShift = False
-                    else:
-                        out +=  '-'
-                        inShift = False
-        i += 1
-            
-    if (bitsleft):
-        out += B64(charsleft << (6-bitsleft) ) 
-        out +=  '-'
-
-    return out
-
-unicode_empty = u''
-
 def unicode_call_errorhandler(errors,  encoding, 
                 reason, input, startinpos, endinpos, decode=True):
     

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	Mon Jun 28 18:53:42 2010
@@ -368,6 +368,7 @@
 for encoders in [
          "ascii_encode",
          "latin_1_encode",
+         "utf_7_encode",
          "utf_8_encode",
          "utf_16_encode",
          "utf_16_be_encode",
@@ -380,6 +381,7 @@
 for decoders in [
          "ascii_decode",
          "latin_1_decode",
+         "utf_7_decode",
          "utf_8_decode",
          "utf_16_decode",
          "utf_16_be_decode",

Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	Mon Jun 28 18:53:42 2010
@@ -418,6 +418,228 @@
 
 
 # ____________________________________________________________
+# utf-7
+
+## indicate whether a UTF-7 character is special i.e. cannot be directly
+##       encoded:
+##         0 - not special
+##         1 - special
+##         2 - whitespace (optional)
+##         3 - RFC2152 Set O (optional)
+
+_utf7_special = [
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+]
+
+def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
+    return (oc > 127 or _utf7_special[oc] == 1 or
+            (encodeWS and _utf7_special[oc] == 2) or
+            (encodeO and _utf7_special[oc] == 3))
+
+def _utf7_B64CHAR(oc):
+    if oc > 127:
+        return False
+    c = chr(oc)
+    return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+    "Returns the base-64 character of the bottom 6 bits of n"
+    return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+    "Returns the base-64 value of a base-64 character"
+    if c == '+':
+        return 62
+    elif c == '/':
+        return 63
+    elif c >= 'a':
+        return ord(c) - 71
+    elif c >= 'A':
+        return ord(c) - 65
+    else:
+        return ord(c) + 4
+
+def _utf7_ENCODE(result, ch, bits) :
+    while (bits >= 6):
+        result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
+        bits -= 6
+    return bits
+
+def str_decode_utf_7(s, size, errors, final=False,
+                     errorhandler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+    if (size == 0):
+        return u'', 0
+
+    inShift = False
+    bitsleft = 0
+    startinpos = 0
+    charsleft = 0
+    surrogate = False
+
+    result = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+
+        if inShift:
+            if ch == '-' or not _utf7_B64CHAR(oc):
+                inShift = 0
+                pos += 1
+
+                while bitsleft >= 16:
+                    outCh =  (charsleft >> (bitsleft-16)) & 0xffff
+                    bitsleft -= 16
+
+                    if surrogate:
+                        ##  We have already generated an error for the high
+                        ##  surrogate so let's not bother seeing if the low
+                        ##  surrogate is correct or not
+                        surrogate = False
+                    elif 0xDC00 <= outCh <= 0xDFFF:
+                        ## This is a surrogate pair. Unfortunately we can't
+                        ## represent it in a 16-bit character
+                        surrogate = True
+                        msg = "code pairs are not supported"
+                        res, pos = errorhandler(errors, 'utf-7',
+                                                msg, s, pos-1, pos)
+                        result.append(res)
+                        bitsleft = 0
+                        break
+                    else:
+                        result.append(unichr(outCh))
+                if bitsleft >= 6:
+                    ## The shift sequence has a partial character in it. If
+                    ## bitsleft < 6 then we could just classify it as padding
+                    ## but that is not the case here
+                    msg = "partial character in shift sequence"
+                    res, pos = errorhandler(errors, 'utf-7',
+                                            msg, s, pos-1, pos)
+                    result.append(res)
+                    ## According to RFC2152 the remaining bits should be
+                    ## zero. We choose to signal an error/insert a replacement
+                    ## character here to indicate the potential of a
+                    ## misencoded character.
+                if ch == '-':
+                    if pos < size and s[pos] == '-':
+                        result.append(u'-')
+                        inShift = True
+
+                elif _utf7_SPECIAL(oc) :
+                    msg = "unexpected special character"
+                    res, pos = errorhandler(errors, 'utf-7',
+                                            msg, s, pos-1, pos)
+                    result.append(res)
+                else:
+                    result.append(unichr(ord(ch)))
+            else:
+                charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
+                bitsleft += 6
+                pos += 1
+        elif ch == '+':
+            startinpos = pos
+            pos += 1
+            if pos < size and s[pos] == '-':
+                pos += 1
+                result.append(u'+')
+            else:
+                inShift = 1
+                bitsleft = 0
+
+        elif _utf7_SPECIAL(oc):
+            pos += 1
+            msg = "unexpected special character"
+            res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
+            result.append(res)
+        else:
+            result.append(unichr(oc))
+            pos += 1
+
+    if inShift:
+        endinpos = size
+        msg = "unterminated shift sequence"
+        res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+        result.append(res)
+
+    return result.build(), pos
+
+def unicode_encode_utf_7(s, size, errors, errorhandler=None):
+    if (size == 0):
+        return ''
+    result = StringBuilder(size)
+
+    encodeSetO = encodeWhiteSpace = False
+
+    inShift = False
+    bitsleft = 0
+    charsleft = 0
+
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+        if not inShift:
+            if ch == u'+':
+                result.append('+-')
+            elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+                charsleft = oc
+                bitsleft = 16
+                result.append('+')
+                bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+                inShift = bitsleft > 0
+            else:
+                result.append(chr(oc))
+        else:
+            if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+                result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
+                charsleft = 0
+                bitsleft = 0
+                ## Characters not in the BASE64 set implicitly unshift the
+                ## sequence so no '-' is required, except if the character is
+                ## itself a '-'
+                if _utf7_B64CHAR(oc) or ch == u'-':
+                    result.append('-')
+                inShift = False
+                result.append(chr(oc))
+            else:
+                bitsleft += 16
+                charsleft = (charsleft << 16) | oc
+                bitsleft =  _utf7_ENCODE(result, charsleft, bitsleft)
+                ## If the next character is special then we don't need to
+                ## terminate the shift sequence. If the next character is not
+                ## a BASE64 character or '-' then the shift sequence will be
+                ## terminated implicitly and we don't have to insert a '-'.
+                if bitsleft == 0:
+                    if pos + 1 < size:
+                        ch2 = s[pos + 1]
+                        oc2 = ord(ch2)
+
+                        if (_utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace)):
+                            pass
+                        elif (_utf7_B64CHAR(oc2) or ch2 == u'-'):
+                            result.append('-')
+                            inShift = False
+                        else:
+                            inShift = False
+                    else:
+                        result.append('-')
+                        inShift = False
+        pos += 1
+
+    if bitsleft:
+        result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+        result.append('-')
+
+    return result.build()
+
+# ____________________________________________________________
 # ascii and latin-1
 
 def str_decode_latin_1(s, size, errors, final=False,
@@ -563,7 +785,7 @@
 
         ch = s[pos]
         pos += 1
-        # \x escapes */
+        # \x escapes
         if ch == '\n': pass
         elif ch == '\\': builder.append(u'\\')
         elif ch == '\'': builder.append(u'\'')
@@ -697,7 +919,7 @@
             pos += 1
             oc2 = ord(s[pos])
 
-            if 0xDC00 <= oc2 < 0xDFFF:
+            if 0xDC00 <= oc2 <= 0xDFFF:
                 ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
                 raw_unicode_escape_helper(result, ucs)
                 pos += 1



More information about the Pypy-commit mailing list