[Python-checkins] CVS: python/dist/src/Tools/unicode makeunicodedata.py,NONE,1.1

Sun, 24 Sep 2000 16:18:35 -0700

Update of /cvsroot/python/python/dist/src/Tools/unicode
In directory slayer.i.sourceforge.net:/tmp/cvs-serv24198/Tools/unicode

Added Files:
	makeunicodedata.py 
Log Message:


unicode database compression, step 1:

- use unidb compression for the unicodedata module.  on Windows,
  the new unidatabase module is 120k, down from nearly 600k.


--- NEW FILE ---
#
# makeunicodedata.py -- generate a compact version of the unicode property
# database (unicodedatabase.h)
#

import sys

# script name and version; both are written into the header comment
# of the generated file
SCRIPT = sys.argv[0]
VERSION = "1.0"

# location of the UnicodeData file to read (hard-coded path)
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt"

# general category names.  the position of a name in this list is the
# number stored in the generated database records, and the list itself
# is written to the generated file, so the order matters.
# NOTE: "Cn" appears twice (positions 0 and 17).  list.index() returns
# the first match, so position 17 is never stored in a record.
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

# bidirectional category names, used the same way as CATEGORY_NAMES.
# position 0 (the empty string) matches an empty bidirectional field.
BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

def maketable():
    # generate the C header "unicodedata_db.h" (in the current
    # directory) from the UnicodeData file named by UNICODE_DATA.
    #
    # the header contains:
    #   - a table of the unique (category, combining, bidirectional,
    #     mirrored, decomposition) records
    #   - the category and bidirectional name tables
    #   - a two-level index (index1/index2 plus SHIFT) that maps a
    #     character code to its record
    #
    # takes no arguments and returns nothing.  errors from reading the
    # input, from unknown category names (ValueError from list.index)
    # or from writing the output are propagated to the caller.

    unidata = UnicodeData(UNICODE_DATA)

    # extract unicode properties.  record 0 is a dummy entry; index
    # slots for characters without a database entry keep pointing at it
    dummy = (0, 0, 0, 0, "NULL")
    table = [dummy]
    # maps each record to its position in table.  the dummy has to be
    # keyed by record as well, or a character whose properties equal
    # the dummy would be added to the table a second time
    cache = {dummy: 0}
    index = [0] * len(unidata.chars)

    for char in unidata.chars:
        record = unidata.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            if record[5]:
                # emitted as a C string literal
                decomposition = '"%s"' % record[5]
            else:
                decomposition = "NULL"
            item = (
                category, combining, bidirectional, mirrored, decomposition
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # FIXME: we really should compress the decomposition stuff
    # (see the unidb utilities for one way to do this)

    # split index table.  this is done before the output file is
    # opened, so a failure here does not leave a truncated header
    index1, index2, shift = splitbins(index)

    FILE = "unicodedata_db.h"

    # write straight to the output file instead of redirecting
    # sys.stdout, and make sure the file gets closed even if something
    # goes wrong half-way through
    fp = open(FILE, "w")
    try:
        write = fp.write

        write("/* this file was generated by %s %s */\n" % (SCRIPT, VERSION))
        write("\n")
        write("/* a list of unique database records */\n")
        write("const _PyUnicode_DatabaseRecord "
              "_PyUnicode_Database_Records[] = {\n")
        for item in table:
            write("    {%d, %d, %d, %d, %s},\n" % item)
        write("};\n")
        write("\n")

        write("/* string literals */\n")
        write("const char *_PyUnicode_CategoryNames[] = {\n")
        for name in CATEGORY_NAMES:
            write("    \"%s\",\n" % name)
        write("    NULL\n")
        write("};\n")

        write("const char *_PyUnicode_BidirectionalNames[] = {\n")
        for name in BIDIRECTIONAL_NAMES:
            write("    \"%s\",\n" % name)
        write("    NULL\n")
        write("};\n")

        write("/* index tables used to find the right database record */\n")
        write("#define SHIFT " + str(shift) + "\n")
        Array("index1", index1).dump(fp)
        Array("index2", index2).dump(fp)
    finally:
        fp.close()

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import string, sys

class UnicodeData:
    # in-memory copy of a UnicodeData file
    #
    # public attributes:
    #   filename -- name of the file the data was loaded from
    #   table    -- list with 65536 slots, indexed by character code.
    #               each slot holds the list of ";"-separated fields
    #               from the line describing that character, or None
    #               if the file has no line for it
    #   chars    -- the character codes to process; all 65536 codes
    #               unless uselatin1() has been called

    def __init__(self, filename):
        # load the given file.  every non-blank line must start with a
        # hexadecimal character code, followed by ";"-separated fields.
        #
        # raises ValueError if a line does not start with a hex number,
        # and IndexError if a character code is 65536 or larger.
        table = [None] * 65536
        fp = open(filename)
        try:
            while 1:
                s = fp.readline()
                if not s:
                    break
                s = s.strip()
                if not s:
                    # tolerate blank lines (e.g. at the end of the file)
                    continue
                fields = s.split(";")
                char = int(fields[0], 16)
                table[char] = fields
        finally:
            # do not leave the input file open, even on malformed data
            fp.close()

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(65536) # unicode

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# stuff to deal with arrays of unsigned integers

class Array:
    # a named sequence of unsigned integers that can be written out
    # as a static C array

    def __init__(self, name, data):
        # name is the C identifier to use, data the sequence of values
        self.name = name
        self.data = data

    def dump(self, file):
        # write the array to file (any object with a write method) as
        # a C definition.  the element type is picked from the size
        # reported by getsize, and the values are wrapped so that
        # output lines stay within 78 characters.
        width = getsize(self.data)
        if width == 1:
            ctype = "unsigned char"
        elif width == 2:
            ctype = "unsigned short"
        else:
            ctype = "unsigned int"
        file.write("static " + ctype + " " + self.name + "[] = {\n")

        indent = "    "
        if self.data:
            line = indent
            for value in self.data:
                cell = str(value) + ", "
                if len(line) + len(cell) <= 78:
                    # still room on the current line
                    line = line + cell
                    continue
                # line is full; flush it and start the next one
                file.write(line + "\n")
                line = indent + cell
            if line.strip():
                file.write(line + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size, in bytes (1, 2 or 4),
    # that can hold every value in the given array of unsigned
    # integers.  an empty array needs no more than the smallest size.
    if not data:
        # max() would raise ValueError on an empty sequence
        return 1
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(bins):
    # compress a sparse integer table into a two-level lookup, so that
    #   value = t2[(t1[char>>shift]<<shift)+(char&mask)]
    # and value == 0 means no data.
    #
    # every shift from 0 to 15 is tried: the table is cut into chunks
    # of 2**shift entries, identical chunks are stored only once in
    # t2, and t1 records which chunk each position uses.  returns the
    # (t1, t2, shift) combination that needs the least memory; the
    # first (smallest) shift wins on a tie.
    best = None
    smallest = None
    for shift in range(16):
        chunk = 1 << shift
        t1 = []
        t2 = []
        seen = {}
        start = 0
        while start < len(bins):
            key = tuple(bins[start:start+chunk])
            offset = seen.get(key)
            if offset is None:
                # first occurrence of this chunk; append it to t2
                offset = seen[key] = len(t2)
                for v in key:
                    if v is None:
                        v = 0
                    t2.append(v)
            t1.append(offset >> shift)
            start = start + chunk
        # determine memory size
        cost = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if smallest is None or cost < smallest:
            smallest = cost
            best = t1, t2, shift
    return best

# generate the header file when run as a script
if __name__ == "__main__":
    maketable()