python3 Unicode is slow

Dale Gerdemann dale.gerdemann at googlemail.com
Sun Oct 25 08:12:13 EDT 2009


I've written simple code in 2.6 and 3.0 to read every character of a
set of files and print out some information for each of these
characters. I tested each program on a large Cyrillic/Latin text. The
result was that the 2.6 version was about 5x faster. Here are the two
programs:

#!/usr/bin/env python

import sys
import codecs
import unicodedata
# Python 2.6 version: for every file named on the command line, print one
# line per character giving the character, the repr of its UTF-8 bytes,
# its code point and its Unicode name.
for path in sys.argv[1:]:
    # Decode as UTF-8 (errors='replace' keeps going past undecodable bytes)
    # and close the handle explicitly instead of leaking it.
    stream = codecs.open(path, encoding='UTF-8', errors='replace')
    try:
        lines = stream.readlines()
    finally:
        stream.close()

    for line in lines:
        for c in line:
            code_point = ord(c)
            name = unicodedata.name(c, 'unknown')
            # prnt is the visible first column; prnt_rep keeps the encoded
            # bytes for the repr column even when prnt is blanked out.
            prnt = prnt_rep = c.encode('utf8')
            if name == 'unknown':
                prnt = ' '
            if code_point > 127:
                print('%s %-14r U+%04x %s' % (
                    prnt, prnt_rep, code_point, name))
            else:
                # Control characters have no Unicode name; give the three
                # common whitespace ones a short label and blank the glyph.
                if code_point == 9:
                    name = 'tab'
                    prnt = ' '
                elif code_point == 10:
                    name = 'LF'
                    prnt = ' '
                elif code_point == 13:
                    name = 'CR'
                    prnt = ' '
                # ASCII is always shown as a two-digit hex escape; the
                # padding lines the columns up with the %-14r branch above.
                print("{0:s} '\\x{1:02x}'         U+{2:04x} {3:s}".format(
                    prnt, code_point, code_point, name))


#!/usr/bin/env python3

import sys
import unicodedata

# Display names for the whitespace control characters, which have no
# Unicode name of their own.
_CONTROL_NAMES = {9: 'tab', 10: 'LF', 13: 'CR'}


def describe_char(c):
    """Return the one-line report for the single character *c*.

    The line has four space-separated columns: the character itself
    (replaced by a blank when it has no Unicode name or is tab/LF/CR), its
    UTF-8 encoding written as a bytes literal and padded to 15 columns, its
    code point as ``U+xxxx``, and its name (``unknown`` when unicodedata
    has none; ``tab``/``LF``/``CR`` for code points 9/10/13).
    """
    code_point = ord(c)
    if code_point <= 127:
        # Hand-built so ASCII always appears as a hex escape (b'\x41')
        # rather than repr()'s printable form (b'A').
        utf8 = "b'\\" + hex(code_point)[1:] + "'"
    else:
        # Format the repr, not the bytes object: applying a width spec to
        # bytes raises TypeError on Python 3.4 and later.
        utf8 = repr(c.encode('utf-8'))

    name = unicodedata.name(c, 'unknown')
    shown = ' ' if name == 'unknown' else c
    if code_point in _CONTROL_NAMES:
        shown = ' '
        name = _CONTROL_NAMES[code_point]

    return "{0:s} {1:15s} U+{2:04x} {3:s}".format(
        shown, utf8, code_point, name)


def main(paths):
    """Print the describe_char() report for every character of each file."""
    for path in paths:
        # encoding='utf-8' avoids depending on the locale's default codec;
        # newline='' disables newline translation so CR characters reach
        # the report instead of being turned into LF.
        with open(path, encoding='utf-8', errors='replace',
                  newline='') as stream:
            lines = stream.readlines()

        for line in lines:
            for c in line:
                print(describe_char(c))


if __name__ == '__main__':
    main(sys.argv[1:])





More information about the Python-list mailing list