[Tutor] unicode ordinals to/from utf8
spir
denis.spir at free.fr
Sat Dec 26 16:41:07 CET 2009
OK, I'll answer myself ;-)
Found the needed information at http://www1.tip.nl/~t876506/utf8tbl.html
See the new version below,
Denis
________________________________
la vita e estrany
http://spir.wikidot.com/
=============================
# coding: utf8
import sys

# short alias for sys.exit
end = sys.exit

# constant: highest unicode code, U+10FFFF (1114111 in decimal)
max_code = 0x10FFFF
def toUTF8(code):
    ''' UTF-8 single character octet string, from unicode code

    Parameters:
        code -- integer unicode code, 0 <= code <= max_code.

    Returns a string holding one character per UTF-8 octet (1 to 4 octets,
    depending on the magnitude of code).

    Raises ValueError when code is negative or greater than max_code.
    '''
    # A negative value would otherwise satisfy the ASCII test below and only
    # fail inside chr() with an unrelated message: reject it explicitly.
    if code < 0:
        message = "Invalid unicode code: %s" % code
        raise ValueError(message)
    # case 1 octet: ASCII --> 0xxxxxxx
    if code < 128:
        octets = (code,)
    # case 2 octets --> 110xxxxx 10xxxxxx
    elif code < 2048:
        o1 = code // 64 + 192
        o2 = code % 64 + 128
        octets = (o1, o2)
    # case 3 octets --> 1110xxxx 10xxxxxx 10xxxxxx
    elif code < 65536:
        o1 = code // 4096 + 224
        o2 = (code // 64) % 64 + 128
        o3 = code % 64 + 128
        octets = (o1, o2, o3)
    # case 4 octets --> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    # (code is already known to be above 65535 at this point)
    elif code <= max_code:
        o1 = code // 262144 + 240
        o2 = (code // 4096) % 64 + 128
        o3 = (code // 64) % 64 + 128
        o4 = code % 64 + 128
        octets = (o1, o2, o3, o4)
    # case error: code above max_code
    else:
        message = "Invalid unicode code: %s" % code
        raise ValueError(message)
    # octet string: each octet value becomes one character
    return ''.join(chr(o) for o in octets)
def fromUTF8(string):
    ''' unicode code, from UTF-8 single character octet string

    Parameters:
        string -- string holding the octets of exactly one UTF-8 encoded
                  character, one character per octet.

    Returns the integer unicode code.

    Raises ValueError when the string is empty, when the first octet is
    not an accepted lead octet, when the number of octets does not match
    the length announced by the lead octet, or when a following octet is
    not of the form 10xxxxxx (128..191).
    '''
    octets = [ord(o) for o in string]
    # an empty string has no lead octet to inspect
    if not octets:
        raise ValueError("Invalid UTF-8 sequence: empty string.")
    o1 = octets[0]
    # The lead octet fixes the sequence length and carries the high bits.
    # case o1 = 0xxxxxxx --> 1 octet: ASCII
    if o1 < 128:
        length, code = 1, o1
    # case o1 = 110xxxxx --> 2 octets (192 itself is not accepted)
    elif o1 > 192 and o1 < 224:
        length, code = 2, o1 - 192
    # case o1 = 1110xxxx --> 3 octets
    elif o1 > 223 and o1 < 240:
        length, code = 3, o1 - 224
    # case o1 = 11110xxx --> 4 octets
    elif o1 > 239 and o1 < 248:
        length, code = 4, o1 - 240
    # case error: not a lead octet; length 0 never matches a real sequence
    else:
        length, code = 0, 0
    followers = octets[1:]
    well_formed = (len(octets) == length
                   and all(128 <= o < 192 for o in followers))
    if not well_formed:
        decseq = " ".join(str(v) for v in octets)
        hexseq = " ".join(hex(v)[2:] for v in octets)
        # Report the lead octet value: calling ord() on the whole string
        # would itself fail for any sequence longer than one octet.
        message = "Invalid UTF-8 sequence: %u [%s] = [%s] (hex)." \
                  % (o1, decseq, hexseq)
        raise ValueError(message)
    # each following octet contributes its 6 low bits
    for o in followers:
        code = code * 64 + (o - 128)
    return code
#
def test():
# ASCII, latin, BMP, >BMP
codes = [ 9,10,32,57,65,97,126,127,
160,233,255,
256,2048,65535,
65536,max_code,
]
for c1 in codes:
u = toUTF8(c1)
c2 = fromUTF8(u)
print c1,"\t --> ",u,"\t --> ",c2
# Run the round-trip demonstration.
test()
More information about the Tutor
mailing list