[Tutor] unicode ordinals to/from utf8

Fri Dec 25 18:39:03 CET 2009

Special offer for coders coding on Christmas day!

I'm looking for the simplest way to decode/encode unicode ordinals (called 'codes' below) to/from utf8. I find this rather tricky, especially because of the variable number of meaningful bits in the first octet. Specifically, for encoding, is there a way to avoid passing through binary (and back)?
Below is what I have so far (tested by converting to utf8 & back ;-).

Denis
________________________________

la vita e estrany

http://spir.wikidot.com/

=========================================
# coding: utf8
import sys ; end = sys.exit

# marker bits carried by the first octet, keyed by sequence length:
# 2 octets -> 0b11000000, 3 -> 0b11100000, 4 -> 0b11110000
sizes_to_values = {2:192, 3:224, 4:240}
def ordinalFromUtf8(s):
	"""Decode one UTF-8 sequence of 1 to 4 octets into its unicode ordinal.

	s -- the encoded form of exactly one character: a byte string, or any
	     sequence whose items are ints (bytes in Python 3, bytearray).

	Returns the ordinal as an integer.

	The length of s alone selects the decoding rule; the marker bits of the
	octets are subtracted, not checked, so malformed input yields a
	meaningless number rather than an error.

	Raises IndexError if s is empty, KeyError if it holds more than 4 octets.
	"""
	# items are 1-char strings for a str, already ints for bytes/bytearray
	octets = [b if isinstance(b, int) else ord(b) for b in s]
	byte0 = octets[0]
	# case ASCII
	if len(octets) == 1:
		return byte0
	# case else
	# strip the length marker from octet #0 to get its payload
	ordinal = byte0 - sizes_to_values[len(octets)]
	# every following octet is '10' + 6 payload bits: make room for
	# 6 more bits, then add the payload
	for byte in octets[1:]:
		ordinal = ordinal * 64 + (byte - 128)
	return ordinal

def ordinalToUtf8(o):
	"""Encode a unicode ordinal as a UTF-8 sequence.

	o -- the ordinal, an integer from 0 to 0x7FFFFFFF.

	Returns a string holding one character per octet, each built with
	chr() (under Python 2 this is the encoded byte string itself).

	Works on the integer directly with shifts and masks, so there is no
	detour through a binary string. Surrogates and the U+10FFFF limit are
	not checked: ordinals above 0x1FFFFF come out in the historical
	5- and 6-octet forms.

	Raises ValueError if o is negative (from chr) or above 0x7FFFFFFF.
	"""
	# case ASCII
	if o < 128:
		return chr(o)
	if o > 0x7FFFFFFF:
		raise ValueError("ordinal too large for utf8: %r" % (o,))
	# case else
	# peel off 6 bits at a time, least significant first; each group
	# becomes a continuation octet '10xxxxxx'
	octets = []
	marker = 0xC0	# first-octet marker for a 2-octet sequence: '110'
	room = 0x20	# the first octet of a 2-octet sequence holds values < 2**5
	while True:
		octets.append(0x80 | (o & 0x3F))
		o >>= 6
		if o < room:
			break
		# what is left does not fit beside the marker: the sequence
		# grows by one octet, so the marker gains a '1' bit and the
		# first octet loses one payload bit
		marker = (marker >> 1) | 0x80
		room >>= 1
	octets.append(marker | o)
	# octets were collected last-to-first
	octets.reverse()
	# convert to chars --> string
	return ''.join([chr(octet) for octet in octets])

def test():
	def ue(u): return unicode.encode(u, 'utf8')
	# ASCII, latin, BMP, >BMP
	chars = ['\n','\t',' ','A','a','~',
			ue(u'\u00a0'),'£','µ','¿','À','é','ÿ',
			ue(u'\u0100'),'€',ue(u'\u1234'),ue(u'\uffff'),
			ue(u'\U00010000'),ue(u'\U000fffff')]
	for char in chars:
		o = ordinalFromUtf8(char)
		s = ordinalToUtf8(o)
		print char,repr(char), "	-->", o,'=',hex(o), "	-->", s,repr(s)
test()