UTF-8 in source code (Re: [Python-Dev] Internationalization Toolkit)
Tim Peters
tim_one@email.msn.com
Wed, 17 Nov 1999 03:14:38 -0500
[MAL]
> ...
> Here is a sample implementation of what I had in mind:
>
> """ Demo for 'unicode-escape' encoding.
> """
> import struct,string,re
>
> pack_format = '>H'
>
> def convert_string(s):
>
> l = map(None,s)
> for i in range(len(l)):
> l[i] = struct.pack(pack_format,ord(l[i]))
> return l
>
> u_escape = re.compile(r'\\u([0-9a-fA-F]{0,4})')
>
> def unicode_unescape(s):
>
> l = []
> start = 0
> while start < len(s):
> m = u_escape.search(s,start)
> if not m:
> l[len(l):] = convert_string(s[start:])
> break
> m_start,m_end = m.span()
> if m_start > start:
> l[len(l):] = convert_string(s[start:m_start])
> hexcode = m.group(1)
> #print hexcode,start,m_start
> if len(hexcode) != 4:
> raise SyntaxError,'illegal \\uXXXX sequence: \\u%s' % hexcode
> ordinal = string.atoi(hexcode,16)
> l.append(struct.pack(pack_format,ordinal))
> start = m_end
> #print l
> return string.join(l,'')
>
> def hexstr(s,sep=''):
>
> return string.join(map(lambda x,hex=hex,ord=ord: '%02x' %
> ord(x),s),sep)
It looks like
r'\\u0000'
will get translated into a 2-character Unicode string. That's probably not
good, if for no other reason than that Java would not do this (it would
create the obvious 7-character Unicode string), and having something that
looks like a Java escape that doesn't *work* like the Java escape will be
confusing as heck for JPython users. Keeping track of even-vs-odd number of
backslashes can't be done with a regexp search, but is easy if the code is
simple <wink>:
def unicode_unescape(s):
    r"""Expand ``\uXXXX`` escapes in *s* and return packed 16-bit code units.

    The string is scanned left to right, so every backslash is paired with
    the character that follows it.  Only the pair ``\u`` starts an escape;
    any other pair (including ``\\``) is copied through unchanged as two
    characters.  That is why ``\\u0000`` yields seven code units rather
    than a backslash followed by U+0000.

    Args:
        s: Text to decode.

    Returns:
        The code units serialised from an ``array.array('H')``, i.e. a
        byte string of unsigned shorts in native byte order.

    Raises:
        ValueError: If *s* ends with an unpaired backslash, if ``\u`` is
            followed by fewer than four characters, or if any of those
            four characters is not a hexadecimal digit.
    """
    import array

    hexdigits = "0123456789abcdefABCDEF"
    i, n = 0, len(s)
    result = array.array('H')  # unsigned short, native order
    while i < n:
        ch = s[i]
        i = i + 1
        if ch != "\\":
            result.append(ord(ch))
            continue
        if i == n:
            raise ValueError("string ends with lone backslash")
        # Consume the character paired with the backslash, so that an
        # escaped backslash can never be mistaken for the start of \u.
        ch = s[i]
        i = i + 1
        if ch != "u":
            result.append(ord("\\"))
            result.append(ord(ch))
            continue
        hexchars = s[i:i+4]
        if len(hexchars) != 4:
            raise ValueError("\\u escape at end not followed by "
                             "at least 4 characters")
        i = i + 4
        # Validate explicitly: int() alone would also accept signs,
        # surrounding whitespace and a "0x" prefix.
        for ch in hexchars:
            if ch not in hexdigits:
                raise ValueError("\\u" + hexchars + " contains "
                                 "non-hex characters")
        result.append(int(hexchars, 16))
    # array.tostring() was removed in Python 3.9; prefer tobytes() and
    # fall back only on interpreters that predate it.
    if hasattr(result, "tobytes"):
        return result.tobytes()
    return result.tostring()