[Tutor] unicode ordinals to/from utf8
spir
denis.spir at free.fr
Fri Dec 25 18:39:03 CET 2009
Special offer for coders coding on Christmas day!
I'm looking for the simplest way to decode/encode unicode ordinals (called 'codes' below) to/from utf8. I find this rather tricky, especially because of the variable number of meaningful bits in the first octet. Specifically, for encoding, is there a way to avoid passing through binary (and back)?
Below is what I have so far (test by converting to utf8 & back ;-).
Denis
________________________________
la vita e estrany
http://spir.wikidot.com/
=========================================
# coding: utf8
import sys

# Alias for sys.exit.
end = sys.exit

# Marker bits carried by the first byte of a multi-byte UTF-8 sequence,
# keyed by the sequence length in bytes:
#   2 -> 110xxxxx, 3 -> 1110xxxx, 4 -> 11110xxx
sizes_to_values = {
    2: 0xC0,
    3: 0xE0,
    4: 0xF0,
}
def ordinalFromUtf8(s):
    """Return the Unicode ordinal encoded by the UTF-8 sequence *s*.

    *s* holds the bytes of exactly one encoded character.  Its items may be
    one-character strings (a Python 2 ``str``) or integers (a Python 3
    ``bytes`` object or a ``bytearray``).

    The length of *s* alone decides how the first byte is interpreted; the
    marker bits of the individual bytes are not validated, so a malformed
    sequence yields a meaningless number rather than an error.

    Raises:
        ValueError: if *s* is empty or longer than 4 bytes.
    """
    # Normalise every item to its integer byte value so the arithmetic below
    # works the same whether indexing *s* gives characters or integers.
    octets = [b if isinstance(b, int) else ord(b) for b in s]
    n = len(octets)
    if not 1 <= n <= 4:
        raise ValueError(
            "expected a UTF-8 sequence of 1 to 4 bytes, got %d" % n)

    # case ASCII: the single byte is the ordinal itself
    if n == 1:
        return octets[0]

    # The first byte starts with n one-bits followed by a zero
    # (0xC0, 0xE0, 0xF0 for n = 2, 3, 4); strip that marker.
    marker = (0xFF << (8 - n)) & 0xFF
    ordinal = octets[0] - marker

    # Each following byte is '10' + 6 payload bits: shift what we have so
    # far by 6 bits and append the payload (Horner's scheme).
    for byte in octets[1:]:
        ordinal = ordinal * 64 + (byte - 128)
    return ordinal
def ordinalToUtf8(o):
    """Return the UTF-8 encoding of the Unicode ordinal *o* as a byte string.

    The result is 1 to 4 bytes long (``str`` on Python 2, ``bytes`` on
    Python 3).  The encoding is computed with integer shifts and masks, with
    no detour through a textual binary representation.

    Surrogate ordinals (0xD800-0xDFFF) are not rejected; they are encoded as
    ordinary 3-byte sequences.

    Raises:
        ValueError: if *o* is negative or greater than 0x10FFFF, the last
            ordinal that UTF-8 can represent.
    """
    if not 0 <= o <= 0x10FFFF:
        raise ValueError("ordinal not in range(0x110000): %r" % (o,))

    # case ASCII: one byte, equal to the ordinal
    if o < 0x80:
        return bytes(bytearray([o]))

    # Sequence length from the number of payload bits available:
    # 2 bytes hold 11 bits, 3 bytes hold 16 bits, 4 bytes hold 21 bits.
    if o < 0x800:
        n = 2
    elif o < 0x10000:
        n = 3
    else:
        n = 4

    # First byte: n one-bits, a zero, then the most significant payload bits.
    marker = (0xFF << (8 - n)) & 0xFF
    octets = [marker | (o >> (6 * (n - 1)))]

    # Remaining bytes: '10' followed by 6 payload bits each, most
    # significant group first.
    for shift in range(6 * (n - 2), -1, -6):
        octets.append(0x80 | ((o >> shift) & 0x3F))

    # bytes(bytearray(...)) gives a byte string on both Python 2 and 3.
    return bytes(bytearray(octets))
def test():
def ue(u): return unicode.encode(u, 'utf8')
# ASCII, latin, BMP, >BMP
chars = ['\n','\t',' ','A','a','~',
ue(u'\u00a0'),'£','µ','¿','À','é','ÿ',
ue(u'\u0100'),'€',ue(u'\u1234'),ue(u'\uffff'),
ue(u'\U00010000'),ue(u'\U000fffff')]
for char in chars:
o = ordinalFromUtf8(char)
s = ordinalToUtf8(o)
print char,repr(char), " -->", o,'=',hex(o), " -->", s,repr(s)
test()
More information about the Tutor
mailing list