[pypy-svn] r75591 - pypy/branch/interplevel-codecs/pypy/rlib
afa at codespeak.net
afa at codespeak.net
Fri Jun 25 15:49:12 CEST 2010
Author: afa
Date: Fri Jun 25 15:49:11 2010
New Revision: 75591
Modified:
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Move code, order functions by codec
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Fri Jun 25 15:49:11 2010
@@ -55,8 +55,8 @@
assert isinstance(u, unicode)
raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
-# ____________________________________________________________
-# unicode decoding
+# ____________________________________________________________
+# utf-8
utf8_code_length = [
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -199,6 +199,51 @@
return u"".join(result), pos
+def _encodeUCS4(result, ch):
+    """Append the four-byte UTF-8 form of the ordinal ``ch`` to ``result``.
+
+    ``result`` is a list that receives four one-character strings through
+    ``append``; the function itself returns nothing.  No range check is
+    performed on ``ch``.
+    """
+    # Encode UCS4 Unicode ordinals
+    # NOTE(review): presumably ch <= 0x10FFFF, so that ch >> 18 fits in the
+    # low bits left free by the 0xf0 marker -- confirm against callers.
+    # Lead byte: 0xf0 marker combined with the bits above bit 17.
+    result.append((chr((0xf0 | (ch >> 18)))))
+    # Three trailing bytes: 0x80 marker plus six payload bits each, taken
+    # from the most significant group down to the least significant one.
+    result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
+    result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+    result.append((chr((0x80 | (ch & 0x3f)))))
+
+def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+    """Encode the first ``size`` characters of ``s`` as UTF-8.
+
+    Returns the encoded byte string.  ``errors`` and ``errorhandler`` are
+    accepted but never read in this function.
+
+    A high surrogate immediately followed by a low surrogate is merged into
+    one ordinal and written as a single four-byte sequence.  Any other
+    surrogate is written as an ordinary three-byte sequence; it is not
+    reported as an error.
+    """
+    assert(size >= 0)
+    # Output is collected as a list of one-character strings, joined once.
+    result = []
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        # From here on ``i`` indexes the character *after* ``ch``.
+        i += 1
+        if (ch < 0x80):
+            # Encode ASCII
+            result.append(chr(ch))
+        elif (ch < 0x0800) :
+            # Two-byte sequence for 0x80..0x7FF (Latin-1 and beyond)
+            result.append(chr((0xc0 | (ch >> 6))))
+            result.append(chr((0x80 | (ch & 0x3f))))
+        else:
+            # Encode UCS2 Unicode ordinals
+            if (ch < 0x10000):
+                # Special case: check for high surrogate
+                # (``i != size`` guards the look-ahead read of s[i] below)
+                if (0xD800 <= ch and ch <= 0xDBFF and i != size) :
+                    ch2 = ord(s[i])
+                    # Check for low surrogate and combine the two to
+                    # form a UCS4 value
+                    if (0xDC00 <= ch2 and ch2 <= 0xDFFF) :
+                        ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
+                        # Consume the low surrogate as well.
+                        i += 1
+                        _encodeUCS4(result, ch3)
+                        continue
+                # Fall through: every other ordinal below 0x10000, which
+                # includes isolated surrogates, gets three bytes
+                result.append((chr((0xe0 | (ch >> 12)))))
+                result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+                result.append((chr((0x80 | (ch & 0x3f)))))
+                continue
+            else:
+                # Ordinal is 0x10000 or above: four-byte sequence.
+                _encodeUCS4(result, ch)
+    return "".join(result)
+
+# ____________________________________________________________
+# utf-16
def str_decode_utf_16(s, size, errors, final=True,
errorhandler=None):
@@ -320,6 +365,60 @@
result.append(r)
return u"".join(result), pos, bo
+def _STORECHAR(result, CH, byteorder):
+    """Append the 16-bit value ``CH`` to ``result`` as two characters.
+
+    ``result`` is a list extended in place with two one-character strings.
+    When ``byteorder`` equals ``'little'`` the low byte is appended first;
+    for any other value the high byte is appended first.  Only the low
+    sixteen bits of ``CH`` are used, the rest is masked away.
+    """
+    hi = chr(((CH) >> 8) & 0xff)
+    lo = chr((CH) & 0xff)
+    if byteorder == 'little':
+        result.append(lo)
+        result.append(hi)
+    else:
+        result.append(hi)
+        result.append(lo)
+
+def unicode_encode_utf_16_helper(s, size, errors,
+                                 errorhandler=None,
+                                 byteorder='little'):
+    """Encode the first ``size`` characters of ``s`` as UTF-16.
+
+    ``byteorder`` selects the layout:
+
+    * ``'native'`` -- a 0xFEFF byte order mark is written first and the
+      data follows in the order named by the module-level ``BYTEORDER``;
+    * ``'little'`` -- low byte first, no byte order mark;
+    * anything else (callers pass ``'big'``) -- high byte first, no mark.
+
+    Ordinals of 0x10000 and above are split into a surrogate pair.
+    ``errors`` and ``errorhandler`` are accepted but never read here.
+
+    Returns the encoded byte string.  For ``size == 0`` that is the byte
+    order mark alone in ``'native'`` mode and ``""`` otherwise.
+    """
+    result = []
+    if (byteorder == 'native'):
+        _STORECHAR(result, 0xFEFF, BYTEORDER)
+        byteorder = BYTEORDER
+
+    # There is deliberately no early ``return ""`` for size == 0: it would
+    # throw away the byte order mark just stored for 'native' mode.  The
+    # loop below simply does nothing for empty input.
+    i = 0
+    while i < size:
+        ch = ord(s[i])
+        i += 1
+        # ch2 stays 0 unless ``ch`` has to be split into two code units.
+        ch2 = 0
+        if (ch >= 0x10000) :
+            # Low ten bits go to the trailing surrogate, the remaining
+            # high bits to the leading surrogate.
+            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
+            ch  = 0xD800 | ((ch-0x10000) >> 10)
+
+        _STORECHAR(result, ch, byteorder)
+        if ch2:
+            _STORECHAR(result, ch2, byteorder)
+
+    return "".join(result)
+
+def unicode_encode_utf_16(s, size, errors,
+                          errorhandler=None):
+    """Encode ``s`` as UTF-16 using the ``"native"`` mode of the helper.
+
+    All arguments are forwarded unchanged to
+    ``unicode_encode_utf_16_helper`` and its result is returned.
+    """
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+
+
+def unicode_encode_utf_16_be(s, size, errors,
+                             errorhandler=None):
+    """Encode ``s`` as UTF-16 using the ``"big"`` mode of the helper.
+
+    All arguments are forwarded unchanged to
+    ``unicode_encode_utf_16_helper`` and its result is returned.
+    """
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+
+
+def unicode_encode_utf_16_le(s, size, errors,
+                             errorhandler=None):
+    """Encode ``s`` as UTF-16 using the ``"little"`` mode of the helper.
+
+    All arguments are forwarded unchanged to
+    ``unicode_encode_utf_16_helper`` and its result is returned.
+    """
+    return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+
+
+# ____________________________________________________________
+# ascii and latin-1
+
def str_decode_latin_1(s, size, errors, final=False,
errorhandler=None):
# latin1 is equivalent to the first 256 ordinals in Unicode.
@@ -350,54 +449,6 @@
return u"".join(result), pos
-# ____________________________________________________________
-# unicode encoding
-
-
-def unicode_encode_utf_8(s, size, errors, errorhandler=None):
- assert(size >= 0)
- result = []
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
- if (ch < 0x80):
- # Encode ASCII
- result.append(chr(ch))
- elif (ch < 0x0800) :
- # Encode Latin-1
- result.append(chr((0xc0 | (ch >> 6))))
- result.append(chr((0x80 | (ch & 0x3f))))
- else:
- # Encode UCS2 Unicode ordinals
- if (ch < 0x10000):
- # Special case: check for high surrogate
- if (0xD800 <= ch and ch <= 0xDBFF and i != size) :
- ch2 = ord(s[i])
- # Check for low surrogate and combine the two to
- # form a UCS4 value
- if (0xDC00 <= ch2 and ch2 <= 0xDFFF) :
- ch3 = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000
- i += 1
- _encodeUCS4(result, ch3)
- continue
- # Fall through: handles isolated high surrogates
- result.append((chr((0xe0 | (ch >> 12)))))
- result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
- result.append((chr((0x80 | (ch & 0x3f)))))
- continue
- else:
- _encodeUCS4(result, ch)
- return "".join(result)
-
-def _encodeUCS4(result, ch):
- # Encode UCS4 Unicode ordinals
- result.append((chr((0xf0 | (ch >> 18)))))
- result.append((chr((0x80 | ((ch >> 12) & 0x3f)))))
- result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
- result.append((chr((0x80 | (ch & 0x3f)))))
-
-
def unicode_encode_ucs1_helper(p, size, errors,
errorhandler=None, limit=256):
if errorhandler is None:
@@ -440,57 +491,6 @@
return res
-def _STORECHAR(result, CH, byteorder):
- hi = chr(((CH) >> 8) & 0xff)
- lo = chr((CH) & 0xff)
- if byteorder == 'little':
- result.append(lo)
- result.append(hi)
- else:
- result.append(hi)
- result.append(lo)
-
-def unicode_encode_utf_16_helper(s, size, errors,
- errorhandler=None,
- byteorder='little'):
- result = []
- if (byteorder == 'native'):
- _STORECHAR(result, 0xFEFF, BYTEORDER)
- byteorder = BYTEORDER
-
- if size == 0:
- return ""
-
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
- ch2 = 0
- if (ch >= 0x10000) :
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
-
- _STORECHAR(result, ch, byteorder)
- if ch2:
- _STORECHAR(result, ch2, byteorder)
-
- return "".join(result)
-
-def unicode_encode_utf_16(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
-
-
-def unicode_encode_utf_16_be(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
-
-
-def unicode_encode_utf_16_le(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
-
-
# ____________________________________________________________
# MBCS codecs for Windows
More information about the Pypy-commit
mailing list