UTF-8 in source code (Re: [Python-Dev] Internationalization Toolkit)

Tim Peters tim_one@email.msn.com
Wed, 17 Nov 1999 03:14:38 -0500


[MAL]
> ...
> Here is a sample implementation of what I had in mind:
>
> """ Demo for 'unicode-escape' encoding.
> """
> import struct,string,re
>
> pack_format = '>H'
>
> def convert_string(s):
>
>     l = map(None,s)
>     for i in range(len(l)):
> 	l[i] = struct.pack(pack_format,ord(l[i]))
>     return l
>
> u_escape = re.compile(r'\\u([0-9a-fA-F]{0,4})')
>
> def unicode_unescape(s):
>
>     l = []
>     start = 0
>     while start < len(s):
> 	m = u_escape.search(s,start)
> 	if not m:
> 	    l[len(l):] = convert_string(s[start:])
> 	    break
> 	m_start,m_end = m.span()
> 	if m_start > start:
> 	    l[len(l):] = convert_string(s[start:m_start])
> 	hexcode = m.group(1)
> 	#print hexcode,start,m_start
> 	if len(hexcode) != 4:
> 	    raise SyntaxError,'illegal \\uXXXX sequence: \\u%s' % hexcode
> 	ordinal = string.atoi(hexcode,16)
> 	l.append(struct.pack(pack_format,ordinal))
> 	start = m_end
>     #print l
>     return string.join(l,'')
>
> def hexstr(s,sep=''):
>
>     return string.join(map(lambda x,hex=hex,ord=ord: '%02x' %
> ord(x),s),sep)

It looks like

    r'\\u0000'

will get translated into a 2-character Unicode string.  That's probably not
good, if for no other reason than that Java would not do this (it would
create the obvious 7-character Unicode string), and having something that
looks like a Java escape that doesn't *work* like the Java escape will be
confusing as heck for JPython users.  Keeping track of even-vs-odd number of
backslashes can't be done with a regexp search, but is easy if the code is
simple <wink>:

def unicode_unescape(s):
    r"""Expand Java-style ``\uXXXX`` escapes in *s* into 16-bit code units.

    The string is scanned left to right, so a backslash that is itself
    preceded by a backslash never starts an escape: ``\\u0000`` stays seven
    literal characters, while ``\u0000`` becomes the single unit 0.

    Rules applied by the scanner:

    * An ordinary character contributes ``ord(ch)``.
    * A backslash followed by ``u`` and exactly four hex digits contributes
      the value of those digits.
    * A backslash followed by any other character contributes BOTH
      characters unchanged (no other escapes are interpreted).

    Returns:
        The raw bytes of an ``array.array('H')`` holding the code units,
        i.e. unsigned shorts in the machine's native byte order.

    Raises:
        ValueError: if the string ends with a lone backslash, if ``\u`` is
            followed by fewer than four characters, or if any of those four
            characters is not a hex digit.

    NOTE(review): a character whose ordinal does not fit in an unsigned
    short (e.g. above 0xFFFF where 'H' is two bytes wide) is rejected by
    ``array.append`` with OverflowError; confirm whether callers need
    surrogate-pair handling.
    """
    import array

    hexdigits = "0123456789abcdefABCDEF"
    i, n = 0, len(s)
    result = array.array('H')  # unsigned short, native order
    while i < n:
        ch = s[i]
        i = i+1
        if ch != "\\":
            result.append(ord(ch))
            continue
        if i == n:
            raise ValueError("string ends with lone backslash")
        # Consume the character after the backslash as well, so that it can
        # never be mistaken for the start of a new escape on the next pass.
        ch = s[i]
        i = i+1
        if ch != "u":
            result.append(ord("\\"))
            result.append(ord(ch))
            continue
        hexchars = s[i:i+4]
        if len(hexchars) != 4:
            raise ValueError("\\u escape at end not followed by "
                             "at least 4 characters")
        i = i+4
        # Validate explicitly: int() alone would also accept signs,
        # surrounding whitespace, underscores and a "0x" prefix.
        for ch in hexchars:
            if ch not in hexdigits:
                raise ValueError("\\u" + hexchars + " contains "
                                 "non-hex characters")
        # int(..., 16) replaces string.atoi, which no longer exists.
        result.append(int(hexchars, 16))

    # tobytes() replaces the removed array.tostring().
    return result.tobytes()