[pypy-svn] r75634 - in pypy/branch/interplevel-codecs/pypy: module/_codecs module/_codecs/test objspace/std rlib

afa at codespeak.net afa at codespeak.net
Mon Jun 28 17:05:27 CEST 2010


Author: afa
Date: Mon Jun 28 17:05:25 2010
New Revision: 75634

Modified:
   pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
   pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
   pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py
   pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Rewrite the unicode-escape codec at interplevel.
Use it for unicode.__repr__


Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/__init__.py	Mon Jun 28 17:05:25 2010
@@ -9,8 +9,6 @@
          'charmap_encode' :  'app_codecs.charmap_encode',
          'escape_decode' :  'app_codecs.escape_decode',
          'escape_encode' :  'app_codecs.escape_encode',
-         'unicode_escape_decode' :  'app_codecs.unicode_escape_decode',
-         'unicode_escape_encode' :  'app_codecs.unicode_escape_encode',
          'unicode_internal_decode' :  'app_codecs.unicode_internal_decode',
          'unicode_internal_encode' :  'app_codecs.unicode_internal_encode',
          'utf_7_decode' :  'app_codecs.utf_7_decode',
@@ -42,6 +40,8 @@
          'charbuffer_encode': 'interp_codecs.buffer_encode',
          'readbuffer_encode': 'interp_codecs.buffer_encode',
          'charmap_decode'   : 'interp_codecs.charmap_decode',
+         'unicode_escape_decode'     :  'interp_codecs.unicode_escape_decode',
+         'unicode_escape_encode'     :  'interp_codecs.unicode_escape_encode',
          'raw_unicode_escape_decode' :  'interp_codecs.raw_unicode_escape_decode',
          'raw_unicode_escape_encode' :  'interp_codecs.raw_unicode_escape_encode',
     }

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/app_codecs.py	Mon Jun 28 17:05:25 2010
@@ -53,21 +53,6 @@
     res = u''.join(res)
     return res, len(data)
 
-def unicode_escape_encode( obj, errors='strict'):
-    """None
-    """
-    res = unicodeescape_string(obj, len(obj), 0)
-    res = ''.join(res)
-    return res, len(obj)
-
-def unicode_escape_decode( data, errors='strict'):
-    """None
-    """
-    res = PyUnicode_DecodeUnicodeEscape(data, len(data), errors)
-    res = u''.join(res)
-    return res, len(data)
-
-
 def charmap_encode(obj, errors='strict', mapping=None):
     """None
     """
@@ -422,84 +407,6 @@
 
 unicode_empty = u''
 
-def unicodeescape_string(s, size, quotes):
-
-    p = []
-    if (quotes) :
-        p += 'u'
-        if (s.find('\'') != -1 and s.find('"') == -1):
-            p += '"' 
-        else:
-            p += '\''
-    pos = 0
-    while (pos < size):
-        ch = s[pos]
-        #/* Escape quotes */
-        if (quotes and (ch == p[1] or ch == '\\')):
-            p += '\\'
-            p += chr(ord(ch))
-            pos += 1
-            continue
-
-#ifdef Py_UNICODE_WIDE
-        #/* Map 21-bit characters to '\U00xxxxxx' */
-        elif (ord(ch) >= 0x10000):
-            p += '\\'
-            p += 'U'
-            p += '%08x' % ord(ch)
-            pos += 1
-            continue        
-#endif
-        #/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
-        elif (ord(ch) >= 0xD800 and ord(ch) < 0xDC00):
-            pos += 1
-            ch2 = s[pos]
-            
-            if (ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF):
-                ucs = (((ord(ch) & 0x03FF) << 10) | (ord(ch2) & 0x03FF)) + 0x00010000
-                p += '\\'
-                p += 'U'
-                p += '%08x' % ucs
-                pos += 1
-                continue
-           
-            #/* Fall through: isolated surrogates are copied as-is */
-            pos -= 1
-            
-        #/* Map 16-bit characters to '\uxxxx' */
-        elif (ord(ch) >= 256):
-            p += '\\'
-            p += 'u'
-            p += '%04x' % ord(ch)
-            
-        #/* Map special whitespace to '\t', \n', '\r' */
-        elif (ch == '\t'):
-            p += '\\'
-            p += 't'
-        
-        elif (ch == '\n'):
-            p += '\\'
-            p += 'n'
-
-        elif (ch == '\r'):
-            p += '\\'
-            p += 'r'
-        elif ch == '\\':
-            p += '\\\\'
-
-        #/* Map non-printable US ASCII to '\xhh' */
-        elif (ch < ' ' or ord(ch) >= 0x7F) :
-            p += '\\'
-            p += 'x'
-            p += '%02x' % ord(ch)
-        #/* Copy everything else as-is */
-        else:
-            p += chr(ord(ch))
-        pos += 1
-    if (quotes):
-        p += p[1]
-    return p
-
 def unicode_call_errorhandler(errors,  encoding, 
                 reason, input, startinpos, endinpos, decode=True):
     
@@ -522,160 +429,6 @@
 
 
 
-hexdigits = [hex(i)[-1] for i in range(16)]+[hex(i)[-1].upper() for i in range(10, 16)]
-
-def hexescape(s, pos, digits, message, errors):
-    import sys
-    chr = 0
-    p = []
-    if (pos+digits>len(s)):
-        message = "end of string in escape sequence"
-        x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, len(s))
-        p += x[0]
-        pos = x[1]
-    else:
-        try:
-            chr = int(s[pos:pos+digits], 16)
-        except ValueError:
-            endinpos = pos
-            while s[endinpos] in hexdigits: 
-                endinpos += 1
-            x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2,
-                        endinpos+1)
-            p += x[0]
-            pos = x[1]
-        #/* when we get here, chr is a 32-bit unicode character */
-        else:
-            if chr <= sys.maxunicode:
-                p += unichr(chr)
-                pos += digits
-            
-            elif (chr <= 0x10ffff):
-                chr -= 0x10000L
-                p += unichr(0xD800 + (chr >> 10))
-                p += unichr(0xDC00 +  (chr & 0x03FF))
-                pos += digits
-            else:
-                message = "illegal Unicode character"
-                x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2,
-                        pos+1)
-                p += x[0]
-                pos = x[1]
-    res = p
-    return res, pos
-
-def PyUnicode_DecodeUnicodeEscape(s, size, errors):
-    import sys
-
-    if (size == 0):
-        return u''
-    
-    p = []
-    pos = 0
-    while (pos < size): 
-##        /* Non-escape characters are interpreted as Unicode ordinals */
-        if (s[pos] != '\\') :
-            p += unichr(ord(s[pos]))
-            pos += 1
-            continue
-##        /* \ - Escapes */
-        else:
-            pos += 1
-            if pos >= len(s):
-                errmessage = "\\ at end of string"
-                unicode_call_errorhandler(errors, "unicodeescape", errmessage, s, pos-1, size)
-            ch = s[pos]
-            pos += 1
-    ##        /* \x escapes */
-            if ch == '\n': pass
-            elif ch == '\\'  : p += u'\\'
-            elif ch == '\'': p += u'\''
-            elif ch == '\"': p += u'\"' 
-            elif ch == 'b' : p += u'\b' 
-            elif ch == 'f' : p += u'\014' #/* FF */
-            elif ch == 't' : p += u'\t' 
-            elif ch == 'n' : p += u'\n'
-            elif ch == 'r' : p += u'\r' 
-            elif ch == 'v': p += u'\013' #break; /* VT */
-            elif ch == 'a': p += u'\007' # break; /* BEL, not classic C */
-            elif '0' <= ch <= '7':
-                x = ord(ch) - ord('0')
-                if pos < size:
-                    ch = s[pos]
-                    if '0' <= ch <= '7':
-                        pos += 1
-                        x = (x<<3) + ord(ch) - ord('0')
-                        if pos < size:
-                            ch = s[pos]
-                            if '0' <= ch <= '7':
-                                pos += 1
-                                x = (x<<3) + ord(ch) - ord('0')
-                p += unichr(x)
-    ##        /* hex escapes */
-    ##        /* \xXX */
-            elif ch == 'x':
-                digits = 2
-                message = "truncated \\xXX escape"
-                x = hexescape(s, pos, digits, message, errors)
-                p += x[0]
-                pos = x[1]
-    
-         #   /* \uXXXX */
-            elif ch == 'u':
-                digits = 4
-                message = "truncated \\uXXXX escape"
-                x = hexescape(s, pos, digits, message, errors)
-                p += x[0]
-                pos = x[1]
-    
-          #  /* \UXXXXXXXX */
-            elif ch == 'U':
-                digits = 8
-                message = "truncated \\UXXXXXXXX escape"
-                x = hexescape(s, pos, digits, message, errors)
-                p += x[0]
-                pos = x[1]
-##        /* \N{name} */
-            elif ch == 'N':
-                message = "malformed \\N character escape"
-                #pos += 1
-                look = pos
-                try:
-                    import unicodedata
-                except ImportError:
-                    message = "\\N escapes not supported (can't load unicodedata module)"
-                    unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, size)
-                if look < size and s[look] == '{':
-                    #/* look for the closing brace */
-                    while (look < size and s[look] != '}'):
-                        look += 1
-                    if (look > pos+1 and look < size and s[look] == '}'):
-                        #/* found a name.  look it up in the unicode database */
-                        message = "unknown Unicode character name"
-                        st = s[pos+1:look]
-                        try:
-                            ch = unicodedata._get_code("%s" % st)
-                        except KeyError, e:
-                            x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
-                            p += x[0]
-                            pos = x[1]
-                        else:
-                            pos = look + 1
-                            if ch <= sys.maxunicode:
-                                p += unichr(ch)
-                            else:
-                                ch -= 0x10000L
-                                p += unichr(0xD800 + (ch >> 10))
-                                p += unichr(0xDC00 +  (ch & 0x03FF))
-                    else:        
-                        x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
-                else:        
-                    x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1)
-            else:
-                p += '\\'
-                p += ch
-    return p
-
 def charmapencode_output(c, mapping):
 
     rep = mapping[c]

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/interp_codecs.py	Mon Jun 28 17:05:25 2010
@@ -1,5 +1,6 @@
 from pypy.interpreter.error import OperationError, operationerrfmt
 from pypy.interpreter.gateway import ObjSpace, NoneNotWrapped, interp2app
+from pypy.interpreter.gateway import unwrap_spec
 from pypy.interpreter.baseobjspace import W_Root
 from pypy.rlib.rstring import StringBuilder, UnicodeBuilder
 from pypy.rlib.objectmodel import we_are_translated
@@ -13,6 +14,8 @@
         self.decode_error_handler = self.make_errorhandler(space, True)
         self.encode_error_handler = self.make_errorhandler(space, False)
 
+        self.unicodedata_getcode = None
+
     def make_errorhandler(self, space, decode):
         def unicode_call_errorhandler(errors,  encoding, reason, input,
                                       startpos, endpos):
@@ -53,6 +56,17 @@
                 return replace, newpos
         return unicode_call_errorhandler
 
+    def get_unicodedata_function(self, space):
+        if self.unicodedata_getcode:
+            return self.unicodedata_getcode
+        w_builtin = space.getbuiltinmodule('__builtin__')
+        w_import = space.getattr(w_builtin, space.wrap("__import__"))
+        w_unicodedata = space.call_function(w_import,
+                                            space.wrap("unicodedata"))
+        self.unicodedata_getcode = space.getattr(w_unicodedata,
+                                                 space.wrap("_get_code"))
+        return self.unicodedata_getcode
+
     def _freeze_(self):
         assert not self.codec_search_path
         return False
@@ -184,7 +198,7 @@
         while pos < end:
             ch = obj[pos]
             builder.append(u"&#")
-            builder.append(unicode(ord(ch)))
+            builder.append(unicode(str(ord(ch))))
             builder.append(u";")
             pos += 1
         return space.newtuple([space.wrap(builder.build()), w_end])
@@ -358,6 +372,7 @@
          "utf_16_encode",
          "utf_16_be_encode",
          "utf_16_le_encode",
+         "unicode_escape_encode",
          "raw_unicode_escape_encode",
         ]:
     make_encoder_wrapper(encoders)
@@ -471,3 +486,40 @@
     res = builder.build()
     return space.newtuple([space.wrap(res), space.wrap(size)])
 charmap_decode.unwrap_spec = [ObjSpace, str, str, W_Root]
+
+# ____________________________________________________________
+# Unicode escape
+
+class UnicodeData_Handler:
+    def __init__(self, space):
+        self.space = space
+        state = space.fromcache(CodecState)
+        self.get_code = state.get_unicodedata_function(space)
+
+    def call(self, name):
+        space = self.space
+        try:
+            w_code = space.call_function(self.get_code, space.wrap(name))
+        except OperationError, e:
+            if not e.match(space, space.w_KeyError):
+                raise
+            return -1
+        return space.int_w(w_code)
+
+@unwrap_spec(ObjSpace, 'bufferstr', str, W_Root)
+def unicode_escape_decode(space, string, errors="strict", w_final=False):
+    final = space.is_true(w_final)
+    state = space.fromcache(CodecState)
+    errorhandler=state.decode_error_handler
+
+    try:
+        unicode_name_handler = UnicodeData_Handler(space)
+    except OperationError:
+        unicode_name_handler = None
+
+    result, consumed = runicode.str_decode_unicode_escape(
+        string, len(string), errors,
+        final, state.decode_error_handler,
+        unicode_name_handler)
+
+    return space.newtuple([space.wrap(result), space.wrap(consumed)])

Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py	Mon Jun 28 17:05:25 2010
@@ -1,7 +1,6 @@
 import autopath
 from pypy.conftest import gettestobjspace
-from pypy.module._codecs.app_codecs import unicode_escape_encode,\
-     charmap_encode, unicode_escape_decode
+from pypy.module._codecs.app_codecs import charmap_encode
 
 
 class AppTestCodecs:
@@ -120,6 +119,11 @@
         map = tuple([unichr(i) for i in range(256)])
         assert charmap_decode('xxx\xff', 'strict', map) == (u'xxx\xff', 4)
 
+    def test_unicode_escape(self):
+        from _codecs import unicode_escape_encode, unicode_escape_decode
+        assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
+        assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
+        assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)
 
 class AppTestPartialEvaluation:
 
@@ -542,6 +546,7 @@
     def test_unicode_escape(self):        
         assert u'\\'.encode('unicode-escape') == '\\\\'
         assert '\\\\'.decode('unicode-escape') == u'\\'
+        assert u'\ud801'.encode('unicode-escape') == '\\ud801'
 
     def test_mbcs(self):
         import sys
@@ -557,8 +562,3 @@
     def test_charmap_encode(self):
         assert charmap_encode(u'xxx') == ('xxx', 3)
         assert charmap_encode(u'xxx', 'strict', {ord('x'): 'XX'}) ==  ('XXXXXX', 6)
-
-    def test_unicode_escape(self):
-        assert unicode_escape_encode(u'abc') == (u'abc'.encode('unicode_escape'), 3)
-        assert unicode_escape_decode('abc') == (u'abc'.decode('unicode_escape'), 3)
-        assert unicode_escape_decode('\\x61\\x62\\x63') == (u'abc', 12)

Modified: pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/objspace/std/unicodeobject.py	Mon Jun 28 17:05:25 2010
@@ -12,6 +12,7 @@
 from pypy.rlib.rarithmetic import intmask, ovfcheck
 from pypy.rlib.objectmodel import compute_hash
 from pypy.rlib.rstring import string_repeat
+from pypy.rlib.runicode import unicode_encode_unicode_escape
 from pypy.module.unicodedata import unicodedb_4_1_0 as unicodedb
 from pypy.tool.sourcetools import func_with_new_name
 
@@ -892,101 +893,11 @@
                     space.wrap("character mapping must return integer, None or unicode"))
     return W_UnicodeObject(u''.join(result))
 
-# Move this into the _codecs module as 'unicodeescape_string (Remember to cater for quotes)'
 def repr__Unicode(space, w_unicode):
-    hexdigits = "0123456789abcdef"
     chars = w_unicode._value
     size = len(chars)
-    
-    singlequote = doublequote = False
-    for c in chars:
-        if c == u'\'':
-            singlequote = True
-        elif c == u'"':
-            doublequote = True
-    if singlequote and not doublequote:
-        quote = '"'
-    else:
-        quote = '\''
-    result = ['u', quote]
-    j = 0
-    while j<len(chars):
-        ch = chars[j]
-        code = ord(ch)
-        if code >= 0x10000:
-            # Resize if needed
-            result.extend(['\\', "U",
-                           hexdigits[(code >> 28) & 0xf],
-                           hexdigits[(code >> 24) & 0xf],
-                           hexdigits[(code >> 20) & 0xf],
-                           hexdigits[(code >> 16) & 0xf],
-                           hexdigits[(code >> 12) & 0xf],
-                           hexdigits[(code >>  8) & 0xf],
-                           hexdigits[(code >>  4) & 0xf],
-                           hexdigits[(code >>  0) & 0xf],
-                           ])
-            j += 1
-            continue
-        if code >= 0xD800 and code < 0xDC00:
-            if j < size - 1:
-                ch2 = chars[j+1]
-                code2 = ord(ch2)
-                if code2 >= 0xDC00 and code2 <= 0xDFFF:
-                    code = (((code & 0x03FF) << 10) | (code2 & 0x03FF)) + 0x00010000
-                    result.extend(['\\', "U",
-                                   hexdigits[(code >> 28) & 0xf],
-                                   hexdigits[(code >> 24) & 0xf],
-                                   hexdigits[(code >> 20) & 0xf],
-                                   hexdigits[(code >> 16) & 0xf],
-                                   hexdigits[(code >> 12) & 0xf],
-                                   hexdigits[(code >>  8) & 0xf],
-                                   hexdigits[(code >>  4) & 0xf],
-                                   hexdigits[(code >>  0) & 0xf],
-                                  ])
-                    j += 2
-                    continue
-                
-        if code >= 0x100:
-            result.extend(['\\', "u",
-                           hexdigits[(code >> 12) & 0xf],
-                           hexdigits[(code >>  8) & 0xf],
-                           hexdigits[(code >>  4) & 0xf],
-                           hexdigits[(code >>  0) & 0xf],
-                          ])
-            j += 1
-            continue
-        if code == ord('\\') or code == ord(quote):
-            result.append('\\')
-            result.append(chr(code))
-            j += 1
-            continue
-        if code == ord('\t'):
-            result.append('\\')
-            result.append('t')
-            j += 1
-            continue
-        if code == ord('\r'):
-            result.append('\\')
-            result.append('r')
-            j += 1
-            continue
-        if code == ord('\n'):
-            result.append('\\')
-            result.append('n')
-            j += 1
-            continue
-        if code < ord(' ') or code >= 0x7f:
-            result.extend(['\\', "x",
-                           hexdigits[(code >> 4) & 0xf], 
-                           hexdigits[(code >> 0) & 0xf],
-                          ])
-            j += 1
-            continue
-        result.append(chr(code))
-        j += 1
-    result.append(quote)
-    return space.wrap(''.join(result))
-        
+    s = unicode_encode_unicode_escape(chars, size, "strict", quotes=True)
+    return space.wrap(s)
 
 def mod__Unicode_ANY(space, w_format, w_values):
     return mod_format(space, w_format, w_values, do_unicode=True)

Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	(original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py	Mon Jun 28 17:05:25 2010
@@ -319,7 +319,7 @@
     result = UnicodeBuilder(size // 2)
 
     #XXX I think the errors are not correctly handled here
-    while (pos < len(s)):
+    while pos < size:
         # remaining bytes at the end? (size should be even)
         if len(s) - pos < 2:
             if not final:
@@ -425,7 +425,7 @@
     # latin1 is equivalent to the first 256 ordinals in Unicode.
     pos = 0
     result = UnicodeBuilder(size)
-    while (pos < size):
+    while pos < size:
         result.append(unichr(ord(s[pos])))
         pos += 1
     return result.build(), pos
@@ -438,7 +438,7 @@
     # ASCII is equivalent to the first 128 ordinals in Unicode.
     result = UnicodeBuilder(size)
     pos = 0
-    while pos < len(s):
+    while pos < size:
         c = s[pos]
         if ord(c) < 128:
             result.append(unichr(ord(c)))
@@ -465,7 +465,7 @@
         return ''
     result = StringBuilder(size)
     pos = 0
-    while pos < len(p):
+    while pos < size:
         ch = p[pos]
         
         if ord(ch) < limit:
@@ -492,6 +492,248 @@
     return res
 
 # ____________________________________________________________
+# Unicode escape
+
+hexdigits = "0123456789ABCDEFabcdef"
+
+def hexescape(builder, s, pos, digits, errorhandler, message, errors):
+    import sys
+    chr = 0
+    if (pos+digits>len(s)):
+        message = "end of string in escape sequence"
+        res, pos = errorhandler(errors, "unicodeescape",
+                                message, s, pos-2, len(s))
+        builder.append(res)
+    else:
+        try:
+            chr = int(s[pos:pos+digits], 16)
+        except ValueError:
+            endinpos = pos
+            while s[endinpos] in hexdigits:
+                endinpos += 1
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-2, endinpos+1)
+            builder.append(res)
+        else:
+            # when we get here, chr is a 32-bit unicode character
+            if chr <= MAXUNICODE:
+                builder.append(unichr(chr))
+                pos += digits
+
+            elif (chr <= 0x10ffff):
+                chr -= 0x10000L
+                builder.append(unichr(0xD800 + (chr >> 10)))
+                builder.append(unichr(0xDC00 +  (chr & 0x03FF)))
+                pos += digits
+            else:
+                message = "illegal Unicode character"
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-2, pos+1)
+                builder.append(res)
+    return pos
+
+def str_decode_unicode_escape(s, size, errors, final=False,
+                              errorhandler=False,
+                              unicodedata_handler=None):
+    if errorhandler is None:
+        errorhandler = raise_unicode_exception_decode
+
+    if (size == 0):
+        return u'', 0
+
+    builder = UnicodeBuilder(size)
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+
+        # Non-escape characters are interpreted as Unicode ordinals
+        if (ch != '\\') :
+            builder.append(unichr(ord(ch)))
+            pos += 1
+            continue
+
+        # - Escapes
+        pos += 1
+        if pos >= size:
+            message = "\\ at end of string"
+            res, pos = errorhandler(errors, "unicodeescape",
+                                    message, s, pos-1, size)
+            builder.append(res)
+            continue
+
+        ch = s[pos]
+        pos += 1
+        # \x escapes */
+        if ch == '\n': pass
+        elif ch == '\\': builder.append(u'\\')
+        elif ch == '\'': builder.append(u'\'')
+        elif ch == '\"': builder.append(u'\"')
+        elif ch == 'b' : builder.append(u'\b')
+        elif ch == 'f' : builder.append(u'\f')
+        elif ch == 't' : builder.append(u'\t')
+        elif ch == 'n' : builder.append(u'\n')
+        elif ch == 'r' : builder.append(u'\r')
+        elif ch == 'v' : builder.append(u'\v')
+        elif ch == 'a' : builder.append(u'\a')
+        elif '0' <= ch <= '7':
+            x = ord(ch) - ord('0')
+            if pos < size:
+                ch = s[pos]
+                if '0' <= ch <= '7':
+                    pos += 1
+                    x = (x<<3) + ord(ch) - ord('0')
+                    if pos < size:
+                        ch = s[pos]
+                        if '0' <= ch <= '7':
+                            pos += 1
+                            x = (x<<3) + ord(ch) - ord('0')
+            builder.append(unichr(x))
+        # hex escapes
+        # \xXX
+        elif ch == 'x':
+            digits = 2
+            message = "truncated \\xXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            errorhandler, message, errors)
+
+        # \uXXXX
+        elif ch == 'u':
+            digits = 4
+            message = "truncated \\uXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            errorhandler, message, errors)
+
+        #  \UXXXXXXXX
+        elif ch == 'U':
+            digits = 8
+            message = "truncated \\UXXXXXXXX escape"
+            pos = hexescape(builder, s, pos, digits,
+                            errorhandler, message, errors)
+
+        # \N{name}
+        elif ch == 'N':
+            message = "malformed \\N character escape"
+            #pos += 1
+            look = pos
+            if unicodedata_handler is None:
+                message = ("\\N escapes not supported "
+                           "(can't load unicodedata module)")
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, size)
+                builder.append(res)
+                continue
+
+            if look < size and s[look] == '{':
+                # look for the closing brace
+                while (look < size and s[look] != '}'):
+                    look += 1
+                if (look > pos+1 and look < size and s[look] == '}'):
+                    # found a name.  look it up in the unicode database
+                    message = "unknown Unicode character name"
+                    name = s[pos+1:look]
+                    code = unicodedata_handler.call(name)
+                    if code < 0:
+                        res, pos = errorhandler(errors, "unicodeescape",
+                                                message, s, pos-1, look+1)
+                        builder.append(res)
+                        continue
+                    pos = look + 1
+                    if code <= MAXUNICODE:
+                        builder.append(unichr(code))
+                    else:
+                        code -= 0x10000L
+                        builder.append(unichr(0xD800 + (code >> 10)))
+                        builder.append(unichr(0xDC00 + (code & 0x03FF)))
+                else:
+                    res, pos = errorhandler(errors, "unicodeescape",
+                                            message, s, pos-1, look+1)
+                    builder.append(res)
+            else:
+                res, pos = errorhandler(errors, "unicodeescape",
+                                        message, s, pos-1, look+1)
+                builder.append(res)
+        else:
+            builder.append(u'\\')
+            builder.append(unichr(ord(ch)))
+
+    return builder.build(), pos
+
+def unicode_encode_unicode_escape(s, size, errors, errorhandler=None, quotes=False):
+    # errorhandler is not used: this function cannot cause Unicode errors
+    result = StringBuilder(size)
+
+    if quotes:
+        if s.find(u'\'') != -1 and s.find(u'\"') == -1:
+            quote = ord('\"')
+            result.append('u"')
+        else:
+            quote = ord('\'')
+            result.append('u\'')
+    else:
+        quote = 0
+
+        if size == 0:
+            return ''
+
+    pos = 0
+    while pos < size:
+        ch = s[pos]
+        oc = ord(ch)
+
+        # Escape quotes
+        if quotes and (oc == quote or ch == '\\'):
+            result.append('\\')
+            result.append(chr(oc))
+            pos += 1
+            continue
+
+        if oc > 0x10000:
+            raw_unicode_escape_helper(result, oc)
+            pos += 1
+            continue
+
+        if 0xD800 <= oc < 0xDC00 and pos + 1 < size:
+            # Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes
+            pos += 1
+            oc2 = ord(s[pos])
+
+            if 0xDC00 <= oc2 < 0xDFFF:
+                ucs = (((oc & 0x03FF) << 10) | (oc2 & 0x03FF)) + 0x00010000
+                raw_unicode_escape_helper(result, ucs)
+                pos += 1
+                continue
+            # Fall through: isolated surrogates are copied as-is
+            pos -= 1
+
+        # Map 16-bit characters to '\uxxxx'
+        if oc >= 0x100:
+            raw_unicode_escape_helper(result, oc)
+            pos += 1
+            continue
+
+        # Map special whitespace to '\t', \n', '\r'
+        if ch == '\t':
+            result.append('\\t')
+        elif ch == '\n':
+            result.append('\\n')
+        elif ch == '\r':
+            result.append('\\r')
+        elif ch == '\\':
+            result.append('\\\\')
+
+        # Map non-printable US ASCII to '\xhh'
+        elif (oc < 32 or oc >= 0x7F) :
+            raw_unicode_escape_helper(result, oc)
+        # Copy everything else as-is
+        else:
+            result.append(chr(oc))
+        pos += 1
+
+    if quotes:
+        result.append(chr(quote))
+    return result.build()
+
+# ____________________________________________________________
 # Raw unicode escape
 
 def str_decode_raw_unicode_escape(s, size, errors, final=False,
@@ -503,7 +745,7 @@
 
     result = UnicodeBuilder(size)
     pos = 0
-    while pos < len(s):
+    while pos < size:
         ch = s[pos]
 
         # Non-escape characters are interpreted as Unicode ordinals
@@ -518,7 +760,7 @@
         bs = pos
         while pos < size:
             pos += 1
-            if (s[pos] != '\\'):
+            if pos == size or s[pos] != '\\':
                 break
             result.append(u'\\')
 
@@ -564,6 +806,19 @@
 
     return result.build(), pos
 
+def raw_unicode_escape_helper(result, char):
+    num = hex(char)
+    if char >= 0x10000:
+        result.append("\\U")
+        zeros = 8
+    else:
+        result.append("\\u")
+        zeros = 4
+    nb = zeros + 2 - len(num) # num starts with '0x'
+    if nb > 0:
+        result.append_multiple_char('0', nb)
+    result.append_slice(num, 2, 8)
+
 def unicode_encode_raw_unicode_escape(s, size, errors, errorhandler=None):
     # errorhandler is not used: this function cannot cause Unicode errors
     if (size == 0):
@@ -575,17 +830,7 @@
         if oc < 0x100:
             result.append(chr(oc))
         else:
-            num = hex(oc)
-            if (oc >= 0x10000):
-                result.append("\\U")
-                zeros = 8
-            else:
-                result.append("\\u")
-                zeros = 4
-            nb = zeros + 2 - len(num) # num starts with '0x'
-            if nb > 0:
-                result.append_multiple_char('0', nb)
-            result.append_slice(num, 2, 8)
+            raw_unicode_escape_helper(result, oc)
         pos += 1
 
     return result.build()



More information about the Pypy-commit mailing list