[pypy-svn] r48616 - pypy/branch/more-unicode-improvements/pypy/module/_codecs
cfbolz at codespeak.net
cfbolz at codespeak.net
Mon Nov 12 22:34:10 CET 2007
Author: cfbolz
Date: Mon Nov 12 22:34:09 2007
New Revision: 48616
Modified:
pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
Log:
kill kill kill kill kill kill kill kill the applevel versions (they are buggy
anyway)
Modified: pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/more-unicode-improvements/pypy/module/_codecs/app_codecs.py Mon Nov 12 22:34:09 2007
@@ -42,12 +42,6 @@
import sys
-def latin_1_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeLatin1(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
# XXX MBCS codec might involve ctypes ?
def mbcs_decode():
"""None
@@ -67,16 +61,6 @@
v = s[1:-1]
return v, len(v)
-def utf_8_decode( data, errors='strict', final=False):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed = PyUnicode_DecodeUTF8Stateful(data, len(data), errors, final)
- res = u''.join(res)
- return res, consumed
-
def raw_unicode_escape_decode( data, errors='strict'):
"""None
"""
@@ -98,23 +82,6 @@
res = ''.join(res)
return res, len(res)
-def latin_1_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeLatin1(data, len(data), errors)
- res = u''.join(res)
- return res, len(res)
-
-def utf_16_decode( data, errors='strict', final=False):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'native', final)
- res = ''.join(res)
- return res, consumed
-
def unicode_escape_decode( data, errors='strict'):
"""None
"""
@@ -123,13 +90,6 @@
return res, len(res)
-def ascii_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeASCII(data, len(data), errors)
- res = u''.join(res)
- return res, len(res)
-
def charmap_encode(obj, errors='strict', mapping='latin-1'):
"""None
"""
@@ -289,20 +249,6 @@
## len(obj))
-def ascii_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeASCII(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native')
- res = ''.join(res)
- return res, len(res)
-
def raw_unicode_escape_encode( obj, errors='strict'):
"""None
"""
@@ -310,47 +256,6 @@
res = ''.join(res)
return res, len(res)
-def utf_8_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF8(obj, len(obj), errors)
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_le_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little')
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_be_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big')
- res = ''.join(res)
- return res, len(res)
-
-def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final)
- res = u''.join(res)
- return res, consumed
-
-def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0):
- """None
- """
- consumed = len(data)
- if final:
- consumed = 0
- res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final)
- res = u''.join(res)
- return res, consumed
-
def strict_errors(exc):
if isinstance(exc, Exception):
raise exc
@@ -714,40 +619,6 @@
p += p[1]
return p
-def PyUnicode_DecodeASCII(s, size, errors):
-
-# /* ASCII is equivalent to the first 128 ordinals in Unicode. */
- if (size == 1 and ord(s) < 128) :
- return [unichr(ord(s))]
- if (size == 0):
- return [u''] #unicode('')
- p = []
- pos = 0
- while pos < len(s):
- c = s[pos]
- if ord(c) < 128:
- p += unichr(ord(c))
- pos += 1
- else:
-
- res = unicode_call_errorhandler(
- errors, "ascii", "ordinal not in range(128)",
- s, pos, pos+1)
- p += [unichr(ord(x)) for x in res[0]]
- pos = res[1]
- return p
-
-def PyUnicode_EncodeASCII(p, size, errors):
-
- return unicode_encode_ucs1(p, size, errors, 128)
-
-def PyUnicode_AsASCIIString(unistr):
-
- if not type(unistr) == unicode:
- raise TypeError
- return PyUnicode_EncodeASCII(unicode(unistr),
- len(unicode),
- None)
def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True):
@@ -871,40 +742,6 @@
else:
return [hi, lo]
-def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'):
-
-# /* Offsets from p for storing byte pairs in the right order. */
-
-
- p = []
- bom = sys.byteorder
- if (byteorder == 'native'):
-
- bom = sys.byteorder
- p += STORECHAR(0xFEFF, bom)
-
- if (size == 0):
- return ""
-
- if (byteorder == 'little' ):
- bom = 'little'
- elif (byteorder == 'big'):
- bom = 'big'
-
-
- for c in s:
- ch = ord(c)
- ch2 = 0
- if (ch >= 0x10000) :
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
-
- p += STORECHAR(ch, bom)
- if (ch2):
- p += STORECHAR(ch2, bom)
-
- return p
-
def PyUnicode_DecodeMBCS(s, size, errors):
pass
@@ -932,239 +769,6 @@
else:
raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res))
-def PyUnicode_DecodeUTF8(s, size, errors):
- return PyUnicode_DecodeUTF8Stateful(s, size, errors, False)
-
-## /* Map UTF-8 encoded prefix byte to sequence length. zero means
-## illegal prefix. see RFC 2279 for details */
-utf8_code_length = [
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
-]
-
-def PyUnicode_DecodeUTF8Stateful(s, size, errors, final):
-
- consumed = 0
- if (size == 0):
- if not final:
- consumed = 0
- return u'', consumed
- p = []
- pos = 0
- while pos < size:
- ch = s[pos]
- if ord(ch) < 0x80:
- p += ch
- pos += 1
- continue
-
- n = utf8_code_length[ord(ch)]
- startinpos = pos
- if (startinpos + n > size):
- if not final:
- break
- else:
- errmsg = "unexpected end of data"
- endinpos = size
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- if n == 0:
- errmsg = "unexpected code byte"
- endinpos = startinpos+1
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- elif n == 1:
- errmsg = "internal error"
- endinpos = startinpos+1
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- elif n == 2:
- if ((ord(s[pos+1]) & 0xc0) != 0x80):
- errmsg = "invalid data"
- endinpos = startinpos+2
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos]) & 0x1f) << 6) + (ord(s[pos+1]) & 0x3f)
- if c < 0x80:
- errmsg = "illegal encoding"
- endinpos = startinpos+2
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- p += unichr(c)
- pos += n
- #break
- elif n == 3:
- if ((ord(s[pos+1]) & 0xc0) != 0x80 or
- (ord(s[pos+2]) & 0xc0) != 0x80):
- errmsg = "invalid data"
- endinpos = startinpos+3
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos]) & 0x0f) << 12) + \
- ((ord(s[pos+1]) & 0x3f) << 6) +\
- (ord(s[pos+2]) & 0x3f)
-
-## /* Note: UTF-8 encodings of surrogates are considered
-## legal UTF-8 sequences;
-##
-## XXX For wide builds (UCS-4) we should probably try
-## to recombine the surrogates into a single code
-## unit.
-## */
- if c < 0x0800:
- errmsg = "illegal encoding"
- endinpos = startinpos+3
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- p += unichr(c)
- pos += n
- elif n == 4:
-## case 4:
- if ((ord(s[pos+1]) & 0xc0) != 0x80 or
- (ord(s[pos+2]) & 0xc0) != 0x80 or
- (ord(s[pos+3]) & 0xc0) != 0x80):
-
- errmsg = "invalid data"
- startinpos = pos
- endinpos = startinpos+4
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
- c = ((ord(s[pos+0]) & 0x7) << 18) + ((ord(s[pos+1]) & 0x3f) << 12) +\
- ((ord(s[pos+2]) & 0x3f) << 6) + (ord(s[pos+3]) & 0x3f)
- #/* validate and convert to UTF-16 */
- if ((c < 0x10000) or (c > 0x10ffff)):
- #/* minimum value allowed for 4 byte encoding */
- #/* maximum value allowed for UTF-16 */
-
- errmsg = "illegal encoding"
- startinpos = pos
- endinpos = startinpos+4
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
- else:
-#ifdef Py_UNICODE_WIDE
- if c < sys.maxunicode:
- p += unichr(c)
- pos += n
- else:
-## /* compute and append the two surrogates: */
-## /* translate from 10000..10FFFF to 0..FFFF */
- c -= 0x10000
- #/* high surrogate = top 10 bits added to D800 */
- p += unichr(0xD800 + (c >> 10))
- #/* low surrogate = bottom 10 bits added to DC00 */
- p += unichr(0xDC00 + (c & 0x03FF))
- pos += n
- else:
-## default:
-## /* Other sizes are only needed for UCS-4 */
- errmsg = "unsupported Unicode code range"
- startinpos = pos
- endinpos = startinpos+n
- res = unicode_call_errorhandler(
- errors, "utf8", errmsg,
- s, startinpos, endinpos)
- p += res[0]
- pos = res[1]
-
- #continue
-
- if not final:
- consumed = pos
- return p, pos # consumed
-
-def PyUnicode_EncodeUTF8(s, size, errors):
-
- #assert(s != None)
- assert(size >= 0)
- p = []
- i = 0
- while i < size:
- ch = s[i]
- i += 1
- if (ord(ch) < 0x80):
-## /* Encode ASCII */
- p += chr(ord(ch))
- elif (ord(ch) < 0x0800) :
-## /* Encode Latin-1 */
- p += chr((0xc0 | (ord(ch) >> 6)))
- p += chr((0x80 | (ord(ch) & 0x3f)))
- else:
-## /* Encode UCS2 Unicode ordinals */
- if (ord(ch) < 0x10000):
-## /* Special case: check for high surrogate */
- if (0xD800 <= ord(ch) and ord(ch) <= 0xDBFF and i != size) :
- ch2 = s[i]
-## /* Check for low surrogate and combine the two to
-## form a UCS4 value */
- if (0xDC00 <= ord(ch2) and ord(ch2) <= 0xDFFF) :
- ch3 = ((ord(ch) - 0xD800) << 10 | (ord(ch2) - 0xDC00)) + 0x10000
- i += 1
- p.extend(encodeUCS4(ch3))
- continue
-## /* Fall through: handles isolated high surrogates */
- p += (chr((0xe0 | (ord(ch) >> 12))))
- p += (chr((0x80 | ((ord(ch) >> 6) & 0x3f))))
- p += (chr((0x80 | (ord(ch) & 0x3f))))
- continue
- else:
- p.extend(encodeUCS4(ord(ch)))
- return p
-
-def encodeUCS4(ch):
-## /* Encode UCS4 Unicode ordinals */
- p = []
- p += (chr((0xf0 | (ch >> 18))))
- p += (chr((0x80 | ((ch >> 12) & 0x3f))))
- p += (chr((0x80 | ((ch >> 6) & 0x3f))))
- p += (chr((0x80 | (ch & 0x3f))))
- return p
#/* --- Latin-1 Codec ------------------------------------------------------ */
More information about the Pypy-commit mailing list