[pypy-svn] r59087 - pypy/dist/pypy/module/unicodedata

iko at codespeak.net iko at codespeak.net
Tue Oct 14 13:48:04 CEST 2008


Author: iko
Date: Tue Oct 14 13:48:03 2008
New Revision: 59087

Added:
   pypy/dist/pypy/module/unicodedata/btreecompress.py
Modified:
   pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
Log:
Use a btree for storing the Unicode data (WIP).

Using a class for the tree nodes has too much overhead: 20K nodes need
450K to store, plus 100K for the string data and 150K for the
codepoint->name tables. It should be possible to at least halve the
node storage.



Added: pypy/dist/pypy/module/unicodedata/btreecompress.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/module/unicodedata/btreecompress.py	Tue Oct 14 13:48:03 2008
@@ -0,0 +1,166 @@
+MINLIST  = 5 # minimum number of codepoints in range to make a list
+MAXBLANK = 8 # max number of holes in a row in list range
+
+classdef = """
+class BTreeEntry(object):
+    substring = ""
+    codepoint = -1
+    left = right = parent = None
+
+    def __init__(self, substring, parent, left=False, codepoint=-1):
+        self.substring = substring
+        self.codepoint = codepoint
+        self.parent = parent
+        self.left = self.right = None
+        if parent:
+            if left:
+                assert parent.left is None
+                parent.left = self
+            else:
+                assert parent.right is None
+                parent.right = self
+
+def btree_lookup(name):
+    charnode = _charnode_0_
+    while charnode:
+        if charnode.codepoint != -1 and name == charnode.substring:
+            return charnode.codepoint
+        if name.startswith(charnode.substring):
+            name = name[len(charnode.substring):]
+            charnode = charnode.left
+        else:
+            charnode = charnode.right
+    raise KeyError, name
+"""
+
+def findranges(d):
+    ranges = []
+    for i in range(max(d)+1):
+        if i in d:
+            if not ranges:
+                ranges.append((i,i))
+                last = i
+                continue
+            if last + 1 == i:
+                ranges[-1] = (ranges[-1][0], i)
+            else:
+                ranges.append((i,i))
+            last = i
+    return ranges
+
+def collapse_ranges(ranges):
+    collapsed = [ranges[0]]
+    for i in range(1,len(ranges)):
+        lows, lowe = collapsed[-1]
+        highs, highe = ranges[i]
+        if highs - lowe < MAXBLANK:
+            collapsed[-1] = (lows, highe)
+        else:
+            collapsed.append(ranges[i])
+
+    return collapsed
+
+def build_compression_tree(outfile, ucdata):
+    print >> outfile, classdef
+
+    reversedict = {}
+    rootnode = gen_compression_tree(
+        outfile, ucdata.keys(), ucdata, reversedict)
+
+    function = ["def lookup_charcode(code):",
+                "    res = None"]
+    ranges = collapse_ranges(findranges(reversedict))
+    for low, high in ranges:
+        if high - low <= MINLIST:
+            for code in range(low, high + 1):
+                if code in reversedict:
+                    function.append(
+                        "    if code == %d: res = %s" %
+                        (code, reversedict[code]))
+            continue
+
+        function.append(
+            "    if %d <= code <= %d: res = _charnames_%d[code-%d]" % (
+            low, high, low, low))
+
+        print >> outfile, "_charnames_%d = [" % (low,)
+        for code in range(low, high + 1):
+            print >> outfile, "%s," % (reversedict.get(code),)
+        print >> outfile, "]\n"
+
+    function.extend(["    if res is None: raise KeyError, code",
+                     "    rstr = []",
+                     "    left = res.left",
+                     "    while res:",
+                     "        if res.left is left:",
+                     "            rstr.insert(0, res.substring)",
+                     "        left = res",
+                     "        res = res.parent",
+                     "    return ''.join(rstr)",
+                     "",
+                     ])
+    print >> outfile, '\n'.join(function)
+
+def gen_compression_tree(outfile, stringlist, ucdata, reversedict, parent=None, parent_str="", left=False, counter=[0]):
+    # Find "best" startstring
+    if not stringlist:
+        return None
+    codes = {}
+    for string in stringlist:
+        for stop in range(1, len(string) + 1):
+            codes[string[:stop]] = codes.get(string[:stop], 0) + 1
+            
+    s = [((freq), code) for (code, freq) in codes.iteritems()]            
+    s.sort()
+    if not s:
+        return None
+    newcode = s[-1][1]
+
+    has_substring = []
+    other_substring = []
+    codepoint = None
+    for string in stringlist:
+        if string == newcode:
+            codepoint = ucdata[parent_str+string]
+        elif string.startswith(newcode):
+            has_substring.append(string[len(newcode):])
+        else:
+            other_substring.append(string)
+
+    btnode = "_charnode_%d_" % (counter[0],)
+    args = '%r, %s' % (newcode, parent)
+    if left:
+        args += ', left=True'
+    if codepoint:
+        args += ', codepoint=%d' % (codepoint,)
+        reversedict[codepoint] = btnode        
+
+    print >> outfile, "%s = BTreeEntry(%s)" % (btnode, args)
+    counter[0] += 1
+
+    gen_compression_tree(
+        outfile, has_substring, ucdata, reversedict,
+        parent=btnode, parent_str=parent_str+newcode,
+        left=True, counter=counter)
+    gen_compression_tree(
+        outfile, other_substring, ucdata, reversedict,
+        parent=btnode, parent_str=parent_str,
+        left=False, counter=counter)
+
+    return btnode
+
+def count_tree(tree):
+    def subsum(tree, cset):
+        if not tree:
+            return 0, 0
+        cset.add(tree.substring)
+        lcount, ldepth = subsum(tree.left,cset)
+        rcount, rdepth = subsum(tree.right,cset)
+        return lcount+rcount+1, max(ldepth, rdepth) + 1
+
+    cset = set()
+    nodecount = subsum(tree, cset)
+    strsize = sum(3*4 + len(s) for s in cset)
+    nchars = sum(map(len, cset))
+
+    return strsize, nodecount, nchars

Modified: pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
==============================================================================
--- pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	(original)
+++ pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	Tue Oct 14 13:48:03 2008
@@ -313,83 +313,12 @@
     print >> outfile, 'def combining(code): return _get_record(code)[4]'
 
 def write_character_names(outfile, table):
-    def findranges(d):
-        ranges = []
-        for i in range(max(d)+1):
-            if i in d:
-                if not ranges:
-                    ranges.append((i,i))
-                    last = i
-                    continue
-                if last + 1 == i:
-                    ranges[-1] = (ranges[-1][0], i)
-                else:
-                    ranges.append((i,i))
-                last = i
-        return ranges
-
-    def collapse_ranges(ranges):
-        collapsed = [ranges[0]]
-        for i in range(1,len(ranges)):
-            lows, lowe = collapsed[-1]
-            highs, highe = ranges[i]
-            if highs - lowe < 8:
-                collapsed[-1] = (lows, highe)
-            else:
-                collapsed.append(ranges[i])
-
-        return collapsed
-
-    names = [table[code].name for code in range(len(table)) if table[code].name]
-    codelist = compression.build_compression_table(names)
-    print >> outfile, "_codelist =", 
-    pprint.pprint(codelist, outfile)
-
-    codes = set(code for code in range(len(table)) if table[code].name)
-    ranges = collapse_ranges(findranges(codes))
-
-    f_reverse_dict = ["def _gen_reverse_dict():",
-                      "    res = {}"]
-    function = ["def lookup_charcode(code):",
-                "    from pypy.module.unicodedata import compression",
-                "    res = None"]
-    for low, high in ranges:
-        if high - low <= 5:
-            # Too short for list
-            for code in range(low, high + 1):
-                name = table[code].name
-                if name:
-                    function.append(
-                        "    if code == %d: res = %r" % (
-                        code, compression.compress(codelist, name)))
-                    f_reverse_dict.append(
-                        "    res[%r] = %d" % (
-                        compression.compress(codelist, name), code))
-            continue
 
-        function.append(
-            "    if %d <= code <= %d: res = _charnames_%d[code-%d]" % (
-            low, high, low, low))
-        f_reverse_dict.extend([
-            "    for i in range(%d, %d):" % (low, high+1),
-            "        name = _charnames_%d[i-%d]" % (low, low),
-            "        if name is not None:",
-            "            res[name] = i",
-            ])
-        print >> outfile, "_charnames_%d = [" % (low,)
-        for code in range(low, high + 1):
-            name = table[code].name
-            if name:
-                print >> outfile, '%r,' % (
-                    compression.compress(codelist, name))
-            else:
-                print >> outfile, 'None,'
-        print >> outfile, "]\n"
-    function.extend(["    if res is None: raise KeyError, code",
-                     "    return compression.uncompress(_codelist, res)\n"])
-    print >> outfile, '\n'.join(function)
-    f_reverse_dict.append("    return res\n")
-    print >> outfile, '\n'.join(f_reverse_dict)
+    import btreecompress
+
+    names = dict((table[code].name,code) for code in range(len(table)) if table[code].name)
+
+    btreecompress.build_compression_tree(outfile, names)
     
 def writeUnicodedata(version, table, outfile):
     # Version
@@ -403,8 +332,6 @@
     write_character_names(outfile, table)
     
     print >> outfile, '''
-_code_by_name = _gen_reverse_dict()
-
 _cjk_prefix = "CJK UNIFIED IDEOGRAPH-"
 _hangul_prefix = 'HANGUL SYLLABLE '
 
@@ -463,12 +390,11 @@
     raise KeyError
 
 def lookup(name):
-    from pypy.module.unicodedata import compression
     if name[:len(_cjk_prefix)] == _cjk_prefix:
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
-    return _code_by_name[compression.compress(_codelist, name)]
+    return btree_lookup(name)
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or



More information about the Pypy-commit mailing list