[Tutor] unicode ordinals to/from utf8

spir denis.spir at free.fr
Sat Dec 26 16:41:07 CET 2009


OK, I'll answer myself ;-)
Found needed information at http://www1.tip.nl/~t876506/utf8tbl.html 
See below new version,

Denis
________________________________

la vita e estrany

http://spir.wikidot.com/



=============================
# coding: utf8
import sys ; end = sys.exit

# constant: highest code accepted by toUTF8 (top of the Unicode code space)
max_code = 1114111		# U+10FFFF

def toUTF8(code):
	''' UTF-8 single character octet string, from unicode code.

	code: integer code, 0 <= code <= max_code, outside the surrogate
	range U+D800..U+DFFF.
	Returns a string of 1 to 4 characters, one character per octet.
	Raises ValueError when code has no UTF-8 encoding.
	'''
	# Reject up front what the size tests below would wrongly accept:
	# a negative code passes "code < 128" and only fails later inside
	# chr() with an unrelated message, and surrogates are reserved for
	# UTF-16 pairs, so they have no UTF-8 form of their own.
	if code < 0 or 0xD800 <= code <= 0xDFFF:
		raise ValueError("Invalid unicode code: %s" %code)
	# case 1 octet: ASCII, 0xxxxxxx
	if code < 128:
		octets = (code,)
	# case 2 octets: 110xxxxx 10xxxxxx
	elif code < 2048:
		octets = (	code // 64 + 192,
					code % 64 + 128)
	# case 3 octets: 1110xxxx 10xxxxxx 10xxxxxx
	elif code < 65536:
		octets = (	code // 4096 + 224,
					(code // 64) % 64 + 128,
					code % 64 + 128)
	# case 4 octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	# (the lower bound is already guaranteed by the tests above)
	elif code <= max_code:
		octets = (	code // 262144 + 240,
					(code // 4096) % 64 + 128,
					(code // 64) % 64 + 128,
					code % 64 + 128)
	# case error: beyond the Unicode code space
	else:
		raise ValueError("Invalid unicode code: %s" %code)
	# octet string
	return ''.join(chr(o) for o in octets)

def fromUTF8(string):
	''' unicode code, from UTF-8 single character octet string.

	string: the 1 to 4 octets that encode exactly one character.
	Returns the integer code.
	Raises ValueError when string is empty, starts with an invalid lead
	octet, has the wrong number of octets for its lead octet, contains
	a malformed continuation octet, or is an overlong, surrogate or
	out-of-range encoding.
	'''
	octets = [ord(o) for o in string]
	if not octets:
		raise ValueError("Invalid UTF-8 sequence: empty string.")
	o1 = octets[0]

	def invalid():
		# Built from the octet list only: calling ord() on the whole
		# string raises TypeError as soon as it has more than one octet.
		decseq = " ".join(str(v) for v in octets)
		hexseq = " ".join(hex(v)[2:] for v in octets)
		message = 	"Invalid UTF-8 sequence: %u [%s] = [%s] (hex)." \
					%(o1,decseq,hexseq)
		return ValueError(message)

	# Per lead octet pattern: (value to subtract from the lead octet,
	# total number of octets, smallest code that needs that many octets).
	# case o1 = 0xxxxxxx --> 1 octet: ASCII
	if o1 < 128:
		offset, length, lowest = 0, 1, 0
	# case o1 = 110xxxxx --> 2 octets
	elif o1 < 192:
		# 10xxxxxx is a continuation octet, never a lead octet
		raise invalid()
	elif o1 < 224:
		offset, length, lowest = 192, 2, 128
	# case o1 = 1110xxxx --> 3 octets
	elif o1 < 240:
		offset, length, lowest = 224, 3, 2048
	# case o1 = 11110xxx --> 4 octets
	elif o1 < 248:
		offset, length, lowest = 240, 4, 65536
	# case error
	else:
		raise invalid()

	# Truncated sequences and trailing extra octets are both errors.
	if len(octets) != length:
		raise invalid()
	code = o1 - offset
	for o in octets[1:]:
		# every following octet must look like 10xxxxxx
		if not 128 <= o < 192:
			raise invalid()
		code = code * 64 + (o - 128)
	# Overlong forms (code would fit in fewer octets), surrogates and
	# codes above U+10FFFF are not valid UTF-8.
	if code < lowest or code > 0x10FFFF or 0xD800 <= code <= 0xDFFF:
		raise invalid()
	return code
#
def test():
	# ASCII, latin, BMP, >BMP
	codes  = [	9,10,32,57,65,97,126,127,
				160,233,255,
				256,2048,65535,
				65536,max_code,
				]
	for c1 in codes:
		u = toUTF8(c1)
		c2 = fromUTF8(u)
		print c1,"\t --> ",u,"\t --> ",c2
# Called at module level: the demo runs whenever this file is executed or imported.
test()


More information about the Tutor mailing list