[pypy-svn] r48616 - pypy/branch/more-unicode-improvements/pypy/module/_codecs

cfbolz at codespeak.net cfbolz at codespeak.net
Mon Nov 12 22:34:10 CET 2007


Author: cfbolz
Date: Mon Nov 12 22:34:09 2007
New Revision: 48616

Modified:
   pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
Log:
Kill the applevel versions of the ASCII, Latin-1, UTF-8 and UTF-16 codec
functions, along with several of their PyUnicode_* helpers (they are buggy
anyway).


Modified: pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py	Mon Nov 12 22:34:09 2007
@@ -42,12 +42,6 @@
 
 import sys
 
-def latin_1_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeLatin1(obj, len(obj), errors)
-    res = ''.join(res)
-    return res, len(res)
 # XXX MBCS codec might involve ctypes ?
 def mbcs_decode():
     """None
@@ -67,16 +61,6 @@
     v = s[1:-1]
     return v, len(v)
 
-def utf_8_decode( data, errors='strict', final=False):
-    """None
-    """
-    consumed = len(data)
-    if final:
-        consumed = 0
-    res, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
-    res = u''.join(res)
-    return res, consumed
-
 def raw_unicode_escape_decode( data, errors='strict'):
     """None
     """
@@ -98,23 +82,6 @@
     res = ''.join(res)
     return res, len(res)
 
-def latin_1_decode( data, errors='strict'):
-    """None
-    """
-    res = PyUnicode_DecodeLatin1(data, len(data), errors)
-    res = u''.join(res)
-    return res, len(res)
-
-def utf_16_decode( data, errors='strict', final=False):
-    """None
-    """
-    consumed = len(data)
-    if final:
-        consumed = 0
-    res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'native', final)
-    res = ''.join(res)
-    return res, consumed
-
 def unicode_escape_decode( data, errors='strict'):
     """None
     """
@@ -123,13 +90,6 @@
     return res, len(res)
 
 
-def ascii_decode( data, errors='strict'):
-    """None
-    """
-    res = PyUnicode_DecodeASCII(data, len(data), errors)
-    res = u''.join(res)
-    return res, len(res)
-
 def charmap_encode(obj, errors='strict', mapping='latin-1'):
     """None
     """
@@ -289,20 +249,6 @@
 ##                  len(obj))
     
 
-def ascii_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeASCII(obj, len(obj), errors)
-    res = ''.join(res)
-    return res, len(res)
-
-def utf_16_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native')
-    res = ''.join(res)
-    return res, len(res)
-
 def raw_unicode_escape_encode( obj, errors='strict'):
     """None
     """
@@ -310,47 +256,6 @@
     res = ''.join(res)
     return res, len(res)
 
-def utf_8_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
-    res = ''.join(res)
-    return res, len(res)
-
-def utf_16_le_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little')
-    res = ''.join(res)
-    return res, len(res)
-
-def utf_16_be_encode( obj, errors='strict'):
-    """None
-    """
-    res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big')
-    res = ''.join(res)
-    return res, len(res)
-
-def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0):
-    """None
-    """
-    consumed = len(data)
-    if final:
-        consumed = 0
-    res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final)
-    res = u''.join(res)
-    return res, consumed
-
-def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
-    """None
-    """
-    consumed = len(data)
-    if final:
-        consumed = 0
-    res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final)
-    res = u''.join(res)
-    return res, consumed
-
 def strict_errors(exc):
     if isinstance(exc, Exception):
         raise exc
@@ -714,40 +619,6 @@
         p += p[1]
     return p
 
-def PyUnicode_DecodeASCII(s, size, errors):
-
-#    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
-    if (size == 1 and ord(s) < 128) :
-        return [unichr(ord(s))]
-    if (size == 0):
-        return [u''] #unicode('')
-    p = []
-    pos = 0
-    while pos < len(s):
-        c = s[pos]
-        if ord(c) < 128:
-            p += unichr(ord(c))
-            pos += 1
-        else:
-            
-            res = unicode_call_errorhandler(
-                    errors, "ascii", "ordinal not in range(128)",
-                    s,  pos, pos+1)
-            p += [unichr(ord(x)) for x in res[0]]
-            pos = res[1]
-    return p
-
-def PyUnicode_EncodeASCII(p, size, errors):
-
-    return unicode_encode_ucs1(p, size, errors, 128)
-
-def PyUnicode_AsASCIIString(unistr):
-
-    if not type(unistr) == unicode:
-        raise TypeError
-    return PyUnicode_EncodeASCII(unicode(unistr),
-                                 len(unicode),
-                                None)
 
 def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True):
 
@@ -871,40 +742,6 @@
     else:
         return [hi, lo]
 
-def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
-
-#    /* Offsets from p for storing byte pairs in the right order. */
-
-        
-    p = []
-    bom = sys.byteorder
-    if (byteorder == 'native'):
-        
-        bom = sys.byteorder
-        p += STORECHAR(0xFEFF, bom)
-        
-    if (size == 0):
-        return ""
-
-    if (byteorder == 'little' ):
-        bom = 'little'
-    elif (byteorder == 'big'):
-        bom = 'big'
-
-
-    for c in s:
-        ch = ord(c)
-        ch2 = 0
-        if (ch >= 0x10000) :
-            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
-            ch  = 0xD800 | ((ch-0x10000) >> 10)
-
-        p += STORECHAR(ch, bom)
-        if (ch2):
-            p += STORECHAR(ch2, bom)
-
-    return p
-
 
 def PyUnicode_DecodeMBCS(s, size, errors):
     pass
@@ -932,239 +769,6 @@
     else:
         raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
 
-def PyUnicode_DecodeUTF8(s, size, errors):
-    return PyUnicode_DecodeUTF8Stateful(s, size, errors, False)
-
-##    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
-##       illegal prefix.  see RFC 2279 for details */
-utf8_code_length = [
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-]
-
-def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
-    
-    consumed = 0
-    if (size == 0):
-        if not final:
-            consumed = 0
-        return u'', consumed
-    p = []
-    pos = 0
-    while pos < size:
-        ch = s[pos]
-        if ord(ch) < 0x80:
-            p += ch
-            pos += 1
-            continue
-        
-        n = utf8_code_length[ord(ch)]
-        startinpos =  pos 
-        if (startinpos + n > size):
-            if not final:
-                break
-            else:
-                errmsg = "unexpected end of data"
-                endinpos = size 
-                res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-        if n == 0:
-            errmsg = "unexpected code byte"
-            endinpos = startinpos+1
-            res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-        elif n == 1:
-            errmsg = "internal error"
-            endinpos = startinpos+1
-            res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-        elif n == 2:
-            if ((ord(s[pos+1]) & 0xc0) != 0x80):
-                errmsg = "invalid data"
-                endinpos = startinpos+2
-                res = unicode_call_errorhandler(
-                                    errors, "utf8", errmsg,
-                                    s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((ord(s[pos]) & 0x1f) << 6) + (ord(s[pos+1]) & 0x3f)
-                if c < 0x80:
-                    errmsg = "illegal encoding"
-                    endinpos = startinpos+2
-                    res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-                    p += unichr(c)
-                    pos += n
-                    #break
-        elif n == 3:
-            if ((ord(s[pos+1]) & 0xc0) != 0x80 or
-                    (ord(s[pos+2]) & 0xc0) != 0x80):
-                errmsg = "invalid data"
-                endinpos = startinpos+3
-                res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((ord(s[pos]) & 0x0f) << 12) + \
-                        ((ord(s[pos+1]) & 0x3f) << 6) +\
-                        (ord(s[pos+2]) & 0x3f)       
-                        
-##              /* Note: UTF-8 encodings of surrogates are considered
-##                 legal UTF-8 sequences;
-##
-##                 XXX For wide builds (UCS-4) we should probably try
-##                     to recombine the surrogates into a single code
-##                     unit.
-##              */
-                if c < 0x0800:
-                    errmsg = "illegal encoding"
-                    endinpos = startinpos+3
-                    res = unicode_call_errorhandler(
-                                        errors, "utf8", errmsg,
-                                        s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-                    p += unichr(c)
-                    pos += n
-        elif n == 4:
-##        case 4:
-            if ((ord(s[pos+1]) & 0xc0) != 0x80 or
-                (ord(s[pos+2]) & 0xc0) != 0x80 or
-                (ord(s[pos+3]) & 0xc0) != 0x80):
-                
-                errmsg = "invalid data"
-                startinpos = pos
-                endinpos = startinpos+4
-                res = unicode_call_errorhandler(
-                            errors, "utf8", errmsg,
-                            s,  startinpos, endinpos)
-                p += res[0]
-                pos = res[1]
-            else:
-                c = ((ord(s[pos+0]) & 0x7) << 18) + ((ord(s[pos+1]) & 0x3f) << 12) +\
-                     ((ord(s[pos+2]) & 0x3f) << 6) + (ord(s[pos+3]) & 0x3f)
-                #/* validate and convert to UTF-16 */
-                if ((c < 0x10000) or (c > 0x10ffff)):
-                    #/* minimum value allowed for 4 byte encoding */
-                    #/* maximum value allowed for UTF-16 */
-           
-                    errmsg = "illegal encoding"
-                    startinpos = pos
-                    endinpos = startinpos+4
-                    res = unicode_call_errorhandler(
-                                            errors, "utf8", errmsg,
-                                            s,  startinpos, endinpos)
-                    p += res[0]
-                    pos = res[1]
-                else:
-#ifdef Py_UNICODE_WIDE
-                    if c < sys.maxunicode:
-                        p += unichr(c)
-                        pos += n
-                    else:
-##                /*  compute and append the two surrogates: */
-##                /*  translate from 10000..10FFFF to 0..FFFF */
-                        c -= 0x10000
-            #/*  high surrogate = top 10 bits added to D800 */
-                        p += unichr(0xD800 + (c >> 10))
-            #/*  low surrogate = bottom 10 bits added to DC00 */
-                        p += unichr(0xDC00 + (c & 0x03FF))
-                        pos += n
-        else:
-##        default:
-##            /* Other sizes are only needed for UCS-4 */
-            errmsg = "unsupported Unicode code range"
-            startinpos = pos
-            endinpos = startinpos+n
-            res = unicode_call_errorhandler(
-                     errors, "utf8", errmsg,
-                     s,  startinpos, endinpos)
-            p += res[0]
-            pos = res[1]
-            
-        #continue
-
-    if not final:
-        consumed = pos
-    return p, pos # consumed
-
-def PyUnicode_EncodeUTF8(s, size, errors):
-
-    #assert(s != None)
-    assert(size >= 0)
-    p = []
-    i = 0
-    while i < size:
-        ch = s[i]
-        i += 1
-        if (ord(ch) < 0x80):
-##         /* Encode ASCII */
-            p += chr(ord(ch))
-        elif (ord(ch) < 0x0800) :
-##            /* Encode Latin-1 */
-            p += chr((0xc0 | (ord(ch) >> 6)))
-            p += chr((0x80 | (ord(ch) & 0x3f)))
-        else:
-##            /* Encode UCS2 Unicode ordinals */
-            if (ord(ch) < 0x10000):
-##                /* Special case: check for high surrogate */
-                if (0xD800 <= ord(ch) and ord(ch) <= 0xDBFF and i != size) :
-                    ch2 = s[i]
-##                    /* Check for low surrogate and combine the two to
-##                       form a UCS4 value */
-                    if (0xDC00 <= ord(ch2) and ord(ch2) <= 0xDFFF) :
-                        ch3 = ((ord(ch) - 0xD800) << 10 | (ord(ch2) - 0xDC00)) + 0x10000
-                        i += 1
-                        p.extend(encodeUCS4(ch3))
-                        continue
-##                    /* Fall through: handles isolated high surrogates */
-                p += (chr((0xe0 | (ord(ch) >> 12))))
-                p += (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
-                p += (chr((0x80 | (ord(ch) & 0x3f))))
-                continue
-            else:
-                p.extend(encodeUCS4(ord(ch)))
-    return p
-
-def encodeUCS4(ch):
-##      /* Encode UCS4 Unicode ordinals */
-    p = []
-    p +=  (chr((0xf0 | (ch >> 18))))
-    p +=  (chr((0x80 | ((ch >> 12) & 0x3f))))
-    p +=  (chr((0x80 | ((ch >> 6) & 0x3f))))
-    p +=  (chr((0x80 | (ch & 0x3f))))
-    return p
 
 #/* --- Latin-1 Codec ------------------------------------------------------ */
 



More information about the Pypy-commit mailing list