[pypy-svn] r75644 - in pypy/branch/interplevel-codecs/pypy: module/_codecs rlib
afa at codespeak.net
afa at codespeak.net
Mon Jun 28 18:53:43 CEST 2010
Author: afa
Date: Mon Jun 28 18:53:42 2010
New Revision: 75644
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Rewrite utf-7 codec at interp-level.
This should dramatically reduce the generated C code.
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py Mon Jun 28 18:53:42 2010
@@ -11,8 +11,6 @@
'escape_encode' : 'app_codecs.escape_encode',
'unicode_internal_decode' : 'app_codecs.unicode_internal_decode',
'unicode_internal_encode' : 'app_codecs.unicode_internal_encode',
- 'utf_7_decode' : 'app_codecs.utf_7_decode',
- 'utf_7_encode' : 'app_codecs.utf_7_encode',
'charmap_build' : 'app_codecs.charmap_build'
}
interpleveldefs = {
@@ -28,6 +26,8 @@
'ascii_encode' : 'interp_codecs.ascii_encode',
'latin_1_decode' : 'interp_codecs.latin_1_decode',
'latin_1_encode' : 'interp_codecs.latin_1_encode',
+ 'utf_7_decode' : 'interp_codecs.utf_7_decode',
+ 'utf_7_encode' : 'interp_codecs.utf_7_encode',
'utf_8_decode' : 'interp_codecs.utf_8_decode',
'utf_8_encode' : 'interp_codecs.utf_8_encode',
'utf_16_be_decode' : 'interp_codecs.utf_16_be_decode',
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py Mon Jun 28 18:53:42 2010
@@ -46,13 +46,6 @@
v = s[1:-1]
return v, len(v)
-def utf_7_decode( data, errors='strict'):
- """None
- """
- res = PyUnicode_DecodeUTF7(data, len(data), errors)
- res = u''.join(res)
- return res, len(data)
-
def charmap_encode(obj, errors='strict', mapping=None):
"""None
"""
@@ -175,238 +168,8 @@
res = ''.join(res)
return res, len(data)
-
-def utf_7_encode( obj, errors='strict'):
- """None
- """
- res = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors)
- res = ''.join(res)
- return res, len(res)
-
# ----------------------------------------------------------------------
-##import sys
-##""" Python implementation of CPythons builtin unicode codecs.
-##
-## Generally the functions in this module take a list of characters an returns
-## a list of characters.
-##
-## For use in the PyPy project"""
-
-
-## indicate whether a UTF-7 character is special i.e. cannot be directly
-## encoded:
-## 0 - not special
-## 1 - special
-## 2 - whitespace (optional)
-## 3 - RFC2152 Set O (optional)
-
-utf7_special = [
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
- 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
-]
-
-
-def SPECIAL(c, encodeO, encodeWS):
- c = ord(c)
- return (c>127 or utf7_special[c] == 1) or \
- (encodeWS and (utf7_special[(c)] == 2)) or \
- (encodeO and (utf7_special[(c)] == 3))
-def B64(n):
- return ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
-def B64CHAR(c):
- return (c.isalnum() or (c) == '+' or (c) == '/')
-def UB64(c):
- if (c) == '+' :
- return 62
- elif (c) == '/':
- return 63
- elif (c) >= 'a':
- return ord(c) - 71
- elif (c) >= 'A':
- return ord(c) - 65
- else:
- return ord(c) + 4
-
-def ENCODE( ch, bits) :
- out = []
- while (bits >= 6):
- out += B64(ch >> (bits-6))
- bits -= 6
- return out, bits
-
-def PyUnicode_DecodeUTF7(s, size, errors):
- from _codecs import lookup_error
- errmsg = ""
- inShift = 0
- bitsleft = 0
- charsleft = 0
- surrogate = 0
- startinpos = 0
- p = []
- errorHandler = None
- exc = None
-
- if (size == 0):
- return unicode('')
- i = 0
- while i < size:
-
- ch = s[i]
- if (inShift):
- if ((ch == '-') or not B64CHAR(ch)):
- inShift = 0
- i += 1
-
- while (bitsleft >= 16):
- outCh = ((charsleft) >> (bitsleft-16)) & 0xffff
- bitsleft -= 16
-
- if (surrogate):
- ## We have already generated an error for the high surrogate
- ## so let's not bother seeing if the low surrogate is correct or not
- surrogate = 0
- elif (0xDC00 <= (outCh) and (outCh) <= 0xDFFF):
- ## This is a surrogate pair. Unfortunately we can't represent
- ## it in a 16-bit character
- surrogate = 1
- msg = "code pairs are not supported"
- out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
- p += out
- bitsleft = 0
- break
- else:
- p += unichr(outCh )
- #p += out
- if (bitsleft >= 6):
-## /* The shift sequence has a partial character in it. If
-## bitsleft < 6 then we could just classify it as padding
-## but that is not the case here */
- msg = "partial character in shift sequence"
- out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
-
-## /* According to RFC2152 the remaining bits should be zero. We
-## choose to signal an error/insert a replacement character
-## here so indicate the potential of a misencoded character. */
-
-## /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
-## if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))):
-## raise UnicodeDecodeError, "non-zero padding bits in shift sequence"
- if (ch == '-') :
- if ((i < size) and (s[i] == '-')) :
- p += '-'
- inShift = 1
-
- elif SPECIAL(ch, 0, 0) :
- msg = "unexpected special character"
- out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
- p += out
- else:
- p += ch
- else:
- charsleft = (charsleft << 6) | UB64(ch)
- bitsleft += 6
- i += 1
-## /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
- elif ( ch == '+' ):
- startinpos = i
- i += 1
- if (i<size and s[i] == '-'):
- i += 1
- p += '+'
- else:
- inShift = 1
- bitsleft = 0
-
- elif (SPECIAL(ch, 0, 0)):
- i += 1
- msg = "unexpected special character"
- out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i)
- p += out
- else:
- p += ch
- i += 1
-
- if (inShift) :
- endinpos = size
- msg = "unterminated shift sequence"
- out, i = unicode_call_errorhandler(errors, 'utf-7', msg, s, startinpos, i)
- p += out
- return p
-
-def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors):
-
-# /* It might be possible to tighten this worst case */
- inShift = False
- i = 0
- bitsleft = 0
- charsleft = 0
- out = []
- for ch in s:
- if (not inShift) :
- if (ch == '+'):
- out += '+'
- out += '-'
- elif (SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
- charsleft = ord(ch)
- bitsleft = 16
- out += '+'
- p, bitsleft = ENCODE( charsleft, bitsleft)
- out += p
- inShift = bitsleft > 0
- else:
- out += chr(ord(ch))
- else:
- if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)):
- out += B64((charsleft) << (6-bitsleft))
- charsleft = 0
- bitsleft = 0
-## /* Characters not in the BASE64 set implicitly unshift the sequence
-## so no '-' is required, except if the character is itself a '-' */
- if (B64CHAR(ch) or ch == '-'):
- out += '-'
- inShift = False
- out += chr(ord(ch))
- else:
- bitsleft += 16
- charsleft = (((charsleft) << 16) | ord(ch))
- p, bitsleft = ENCODE(charsleft, bitsleft)
- out += p
-## /* If the next character is special then we dont' need to terminate
-## the shift sequence. If the next character is not a BASE64 character
-## or '-' then the shift sequence will be terminated implicitly and we
-## don't have to insert a '-'. */
-
- if (bitsleft == 0):
- if (i + 1 < size):
- ch2 = s[i+1]
-
- if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)):
- pass
- elif (B64CHAR(ch2) or ch2 == '-'):
- out += '-'
- inShift = False
- else:
- inShift = False
- else:
- out += '-'
- inShift = False
- i += 1
-
- if (bitsleft):
- out += B64(charsleft << (6-bitsleft) )
- out += '-'
-
- return out
-
-unicode_empty = u''
-
def unicode_call_errorhandler(errors, encoding,
reason, input, startinpos, endinpos, decode=True):
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py Mon Jun 28 18:53:42 2010
@@ -368,6 +368,7 @@
for encoders in [
"ascii_encode",
"latin_1_encode",
+ "utf_7_encode",
"utf_8_encode",
"utf_16_encode",
"utf_16_be_encode",
@@ -380,6 +381,7 @@
for decoders in [
"ascii_decode",
"latin_1_decode",
+ "utf_7_decode",
"utf_8_decode",
"utf_16_decode",
"utf_16_be_decode",
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Mon Jun 28 18:53:42 2010
@@ -418,6 +418,228 @@
# ____________________________________________________________
+# utf-7
+
+## Indicates whether a UTF-7 character is special, i.e. cannot be directly
+## encoded:
+## 0 - not special
+## 1 - special
+## 2 - whitespace (optional)
+## 3 - RFC2152 Set O (optional)
+
+_utf7_special = [
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
+]
+
+def _utf7_SPECIAL(oc, encodeO=False, encodeWS=False):
+ return (oc > 127 or _utf7_special[oc] == 1 or
+ (encodeWS and _utf7_special[oc] == 2) or
+ (encodeO and _utf7_special[oc] == 3))
+
+def _utf7_B64CHAR(oc):
+ if oc > 127:
+ return False
+ c = chr(oc)
+ return c.isalnum() or c == '+' or c == '/'
+def _utf7_TO_BASE64(n):
+ "Returns the base-64 character of the bottom 6 bits of n"
+ return "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[n & 0x3f]
+def _utf7_FROM_BASE64(c):
+ "Returns the base-64 value of a base-64 character"
+ if c == '+':
+ return 62
+ elif c == '/':
+ return 63
+ elif c >= 'a':
+ return ord(c) - 71
+ elif c >= 'A':
+ return ord(c) - 65
+ else:
+ return ord(c) + 4
+
+def _utf7_ENCODE(result, ch, bits) :
+ while (bits >= 6):
+ result.append(_utf7_TO_BASE64(ch >> (bits - 6)))
+ bits -= 6
+ return bits
+
+def str_decode_utf_7(s, size, errors, final=False,
+ errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_decode
+ if (size == 0):
+ return u'', 0
+
+ inShift = False
+ bitsleft = 0
+ startinpos = 0
+ charsleft = 0
+ surrogate = False
+
+ result = UnicodeBuilder(size)
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+
+ if inShift:
+ if ch == '-' or not _utf7_B64CHAR(oc):
+ inShift = 0
+ pos += 1
+
+ while bitsleft >= 16:
+ outCh = (charsleft >> (bitsleft-16)) & 0xffff
+ bitsleft -= 16
+
+ if surrogate:
+ ## We have already generated an error for the high
+ ## surrogate so let's not bother seeing if the low
+ ## surrogate is correct or not
+ surrogate = False
+ elif 0xDC00 <= outCh <= 0xDFFF:
+ ## This is a surrogate pair. Unfortunately we can't
+ ## represent it in a 16-bit character
+ surrogate = True
+ msg = "code pairs are not supported"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ bitsleft = 0
+ break
+ else:
+ result.append(unichr(outCh))
+ if bitsleft >= 6:
+ ## The shift sequence has a partial character in it. If
+ ## bitsleft < 6 then we could just classify it as padding
+ ## but that is not the case here
+ msg = "partial character in shift sequence"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ ## According to RFC2152 the remaining bits should be
+ ## zero. We choose to signal an error/insert a replacement
+ ## character here to indicate the potential of a
+ ## misencoded character.
+ if ch == '-':
+ if pos < size and s[pos] == '-':
+ result.append(u'-')
+ inShift = True
+
+ elif _utf7_SPECIAL(oc) :
+ msg = "unexpected special character"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ else:
+ result.append(unichr(ord(ch)))
+ else:
+ charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
+ bitsleft += 6
+ pos += 1
+ elif ch == '+':
+ startinpos = pos
+ pos += 1
+ if pos < size and s[pos] == '-':
+ pos += 1
+ result.append(u'+')
+ else:
+ inShift = 1
+ bitsleft = 0
+
+ elif _utf7_SPECIAL(oc):
+ pos += 1
+ msg = "unexpected special character"
+ res, pos = errorhandler(errors, 'utf-7', msg, s, pos-1, pos)
+ result.append(res)
+ else:
+ result.append(unichr(oc))
+ pos += 1
+
+ if inShift:
+ endinpos = size
+ msg = "unterminated shift sequence"
+ res, pos = errorhandler(errors, 'utf-7', msg, s, startinpos, pos)
+ result.append(res)
+
+ return result.build(), pos
+
+def unicode_encode_utf_7(s, size, errors, errorhandler=None):
+ if (size == 0):
+ return ''
+ result = StringBuilder(size)
+
+ encodeSetO = encodeWhiteSpace = False
+
+ inShift = False
+ bitsleft = 0
+ charsleft = 0
+
+ pos = 0
+ while pos < size:
+ ch = s[pos]
+ oc = ord(ch)
+ if not inShift:
+ if ch == u'+':
+ result.append('+-')
+ elif _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+ charsleft = oc
+ bitsleft = 16
+ result.append('+')
+ bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+ inShift = bitsleft > 0
+ else:
+ result.append(chr(oc))
+ else:
+ if not _utf7_SPECIAL(oc, encodeSetO, encodeWhiteSpace):
+ result.append(_utf7_TO_BASE64(charsleft << (6-bitsleft)))
+ charsleft = 0
+ bitsleft = 0
+ ## Characters not in the BASE64 set implicitly unshift the
+ ## sequence so no '-' is required, except if the character is
+ ## itself a '-'
+ if _utf7_B64CHAR(oc) or ch == u'-':
+ result.append('-')
+ inShift = False
+ result.append(chr(oc))
+ else:
+ bitsleft += 16
+ charsleft = (charsleft << 16) | oc
+ bitsleft = _utf7_ENCODE(result, charsleft, bitsleft)
+ ## If the next character is special then we don't need to
+ ## terminate the shift sequence. If the next character is not
+ ## a BASE64 character or '-' then the shift sequence will be
+ ## terminated implicitly and we don't have to insert a '-'.
+ if bitsleft == 0:
+ if pos + 1 < size:
+ ch2 = s[pos + 1]
+ oc2 = ord(ch2)
+
+ if (_utf7_SPECIAL(oc2, encodeSetO, encodeWhiteSpace)):
+ pass
+ elif (_utf7_B64CHAR(oc2) or ch2 == u'-'):
+ result.append('-')
+ inShift = False
+ else:
+ inShift = False
+ else:
+ result.append('-')
+ inShift = False
+ pos += 1
+
+ if bitsleft:
+ result.append(_utf7_TO_BASE64(charsleft << (6 - bitsleft)))
+ result.append('-')
+
+ return result.build()
+
+# ____________________________________________________________
# ascii and latin-1
def str_decode_latin_1(s, size, errors, final=False,
@@ -563,7 +785,7 @@
ch = s[pos]
pos += 1
- # \x escapes */
+ # \x escapes
if ch == '\n': pass
elif ch == '\\': builder.append(u'\\')
elif ch == '\'': builder.append(u'\'')
@@ -697,7 +919,7 @@
pos += 1
oc2 = ord(s[pos])
- if 0xDC00 <= oc2 < 0xDFFF:
+ if 0xDC00 <= oc2 <= 0xDFFF:
ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
raw_unicode_escape_helper(result, ucs)
pos += 1
More information about the Pypy-commit
mailing list