[pypy-svn] r59099 - in pypy/dist/pypy/module/unicodedata: . test

iko at codespeak.net iko at codespeak.net
Tue Oct 14 22:45:23 CEST 2008


Author: iko
Date: Tue Oct 14 22:45:20 2008
New Revision: 59099

Added:
   pypy/dist/pypy/module/unicodedata/test/test_trie.py
   pypy/dist/pypy/module/unicodedata/triegenerator.py   (contents, props changed)
      - copied, changed from r59087, pypy/dist/pypy/module/unicodedata/btreecompress.py
Removed:
   pypy/dist/pypy/module/unicodedata/btreecompress.py
   pypy/dist/pypy/module/unicodedata/compression.py
   pypy/dist/pypy/module/unicodedata/test/test_compression.py
Modified:
   pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
Log:
Improve efficiency of storage of unicode data tree (which is rightly
a trie as alexander pointed out)

We now use 375k total for one unicode database (according to static data
reporting, and it agrees with estimations based on sizes of strings
and lists)



Modified: pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
==============================================================================
--- pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	(original)
+++ pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	Tue Oct 14 22:45:20 2008
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 import pprint
-import compression
 
 MAXUNICODE = 0x10FFFF     # the value of sys.maxunicode of wide Python builds
 
@@ -314,11 +313,11 @@
 
 def write_character_names(outfile, table):
 
-    import btreecompress
+    import triegenerator
 
     names = dict((table[code].name,code) for code in range(len(table)) if table[code].name)
 
-    btreecompress.build_compression_tree(outfile, names)
+    triegenerator.build_compression_tree(outfile, names)
     
 def writeUnicodedata(version, table, outfile):
     # Version
@@ -394,7 +393,7 @@
         return _lookup_cjk(name[len(_cjk_prefix):])
     if name[:len(_hangul_prefix)] == _hangul_prefix:
         return _lookup_hangul(name[len(_hangul_prefix):])
-    return btree_lookup(name)
+    return trie_lookup(name)
 
 def name(code):
     if (0x3400 <= code <= 0x4DB5 or

Added: pypy/dist/pypy/module/unicodedata/test/test_trie.py
==============================================================================
--- (empty file)
+++ pypy/dist/pypy/module/unicodedata/test/test_trie.py	Tue Oct 14 22:45:20 2008
@@ -0,0 +1,22 @@
+import py
+import StringIO
+
+from pypy.module.unicodedata import triegenerator
+
+def setup_module(mod):
+    mod.tmpdir = py.test.ensuretemp(mod.__name__)
+    mod.lines = lines = map(hex,map(hash, map(str, range(100))))
+    # some extra handcrafted tests
+    lines.extend([ 'AAA', 'AAAA', 'AAAB', 'AAB', 'AABB' ]) 
+    out = mod.tmpdir.join('btree.py')
+    o = out.open('w')
+    mod.trie = triegenerator.build_compression_tree(
+        o, dict(map(lambda (x,y):(y,x), enumerate(lines))))
+    o.close()
+    mod.bt = out.pyimport()
+
+
+def test_roundtrip():
+    for i, line in enumerate(lines):
+        assert bt.lookup_charcode(i) == line
+        assert bt.trie_lookup(line) == i

Copied: pypy/dist/pypy/module/unicodedata/triegenerator.py (from r59087, pypy/dist/pypy/module/unicodedata/btreecompress.py)
==============================================================================
--- pypy/dist/pypy/module/unicodedata/btreecompress.py	(original)
+++ pypy/dist/pypy/module/unicodedata/triegenerator.py	Tue Oct 14 22:45:20 2008
@@ -1,17 +1,44 @@
+import pprint
+
 MINLIST  = 5 # minimum number of codepoints in range to make a list
 MAXBLANK = 8 # max number of holes in a row in list range
 
-classdef = """
-class BTreeEntry(object):
-    substring = ""
-    codepoint = -1
-    left = right = parent = None
+STRIDXBITS = 16 # bits to use for string index. Remaining are
+                # used for parent pointer
+
+#
+# The trie of the unicode names is stored as a list, with 16-bit
+# indexes for left, right and parent pointer, and also pointer
+# into the string table (which is really just a long string)
+#
+# note, the size of the parent and the string pointer depend
+# on STRIDXBITS, the latter being used for the string pointer
+# and whatever is left for the parent pointer
+#
+# Each node is represented by 3 entries in the _charnodes list:
+#
+# [leftright, parentstr, codepoint]
+#
+# (keeping them directly in the list rather than as 3-tuples
+# saves 8 bytes per entry)
+#
+# where leftright is left << 16 | right
+# and parentstr is parent << STRIDXBITS | string
+# (with some additional logic to account for the fact that integers
+# are signed)
+
+class TrieEntry(object):
+    allstrings = set()
+    counter = [0]
 
     def __init__(self, substring, parent, left=False, codepoint=-1):
         self.substring = substring
+        self.allstrings.add(substring)
         self.codepoint = codepoint
         self.parent = parent
         self.left = self.right = None
+        self.index = self.counter[0]
+        self.counter[0] += 1
         if parent:
             if left:
                 assert parent.left is None
@@ -20,18 +47,95 @@
                 assert parent.right is None
                 parent.right = self
 
-def btree_lookup(name):
-    charnode = _charnode_0_
-    while charnode:
-        if charnode.codepoint != -1 and name == charnode.substring:
-            return charnode.codepoint
-        if name.startswith(charnode.substring):
-            name = name[len(charnode.substring):]
-            charnode = charnode.left
+    def as_list(self, stringidx):
+        parentidx = leftidx = rightidx = -1
+        if self.left:
+            leftidx = self.left.index
+
+        if self.right:
+            rightidx = self.right.index
+
+        if self.parent:
+            parentidx = self.parent.index
+
+        stridx = stringidx[self.substring]
+
+        leftright = (leftidx&0xffff) << 16 | (rightidx&0xffff)
+        if leftright >= 2**31:
+            leftright = int(~0x7fffffff | (0x7fffffff&leftright))
+
+        parentstr = ((parentidx & ((1<<(32-STRIDXBITS))-1)) << STRIDXBITS |
+                      (stridx & ((1<<STRIDXBITS)-1)))
+        if parentstr >= 2**31:
+            parentstr = int(~0x7fffffff | (0x7fffffff&parentstr))
+
+        return (leftright, parentstr, self.codepoint)
+
+classdef = """
+def trie_lookup(name):
+    charnode = 0
+    while 0 <= charnode < 0xffff: # 16bit number, 0xffff = None
+        charnode *= 3
+        leftright, parentstr, codepoint = _charnodes[charnode:charnode+3]
+
+        if leftright < 0:
+            # XXX assumes msb is sign
+            left = 0x8000 | ((leftright & 0x7fffffff) >> 16)
+        else:
+            left = (leftright & 0x7fffffff) >> 16
+        right = leftright & 0xffff
+
+        if parentstr < 0:
+            # XXX assumes msb is sign
+            parent = 0x8000 | ((parentstr & 0x7fffffff) >> %(STRIDXBITS)d)
+        else:
+            parent = (parentstr & 0x7fffffff) >> %(STRIDXBITS)d
+        stridx = parentstr & ((1 << %(STRIDXBITS)d) - 1)
+        
+        strlen = ord(_stringtable[stridx])
+        substring = _stringtable[stridx+1:stridx+1+strlen]
+
+        if codepoint != -1 and name == substring:
+            return int(codepoint)
+        if name.startswith(substring):
+            name = name[strlen:]
+            charnode = left
         else:
-            charnode = charnode.right
+            charnode = right
     raise KeyError, name
-"""
+
+def name_of_node(charnode):
+    res = []
+    prevnode = -1
+    while 0 <= charnode < 0xffff: # 16bit number, 0xffff = None
+        charnode *= 3
+        leftright, parentstr, codepoint = _charnodes[charnode:charnode+3]
+
+        if leftright < 0:
+            # XXX assumes msb is sign
+            left = 0x8000 | ((leftright & 0x7fffffff) >> 16)
+        else:
+            left = (leftright & 0x7fffffff) >> 16
+        right = leftright & 0xffff
+
+        if parentstr < 0:
+            # XXX assumes msb is sign
+            parent = 0x8000 | ((parentstr & 0x7fffffff) >> %(STRIDXBITS)d)
+        else:
+            parent = (parentstr & 0x7fffffff) >> %(STRIDXBITS)d
+
+        if prevnode < 0 or prevnode == left:
+            stridx = parentstr & ((1<<%(STRIDXBITS)d)-1)
+            strlen = ord(_stringtable[stridx])
+            substring = _stringtable[stridx+1:stridx+1+strlen]
+            res.insert(0, substring)
+
+        prevnode = charnode // 3
+        charnode = parent
+
+    return ''.join(res)
+    
+""" % globals()
 
 def findranges(d):
     ranges = []
@@ -64,11 +168,48 @@
     print >> outfile, classdef
 
     reversedict = {}
-    rootnode = gen_compression_tree(
-        outfile, ucdata.keys(), ucdata, reversedict)
+    rootnode = gen_compression_tree(ucdata.keys(), ucdata, reversedict)
 
+    # write string table
+    print >> outfile, "_stringtable = ("
+    stringidx = {}
+    stridx = 0
+    for string in rootnode.allstrings:
+        strlen = len(string)
+        assert strlen < 256, "Substring too long, > 255 chars"
+        print >> outfile, "%r" % (chr(strlen) + string)
+        stringidx[string] = stridx
+        stridx += strlen + 1
+        
+    print >> outfile, ")"
+
+    print >> outfile, "_dbg_stringidx = ",
+    pprint.pprint(stringidx,stream=outfile)
+    
+    assert stridx < (1<<STRIDXBITS), "Too many strings, > %d chars" % (
+        ((1<<STRIDXBITS) - 1))
+
+    # build trie list
+    nodelist = []
+    maxidx = 0
+    nodes = [rootnode]
+    
+    while nodes:
+        n = nodes.pop()
+        nodelist.append(n)
+        if n.left:
+            nodes.append(n.left)
+        if n.right:
+            nodes.append(n.right)
+        
+    nodelist.sort(key=lambda x: x.index)
+    newnodes = []
+    map(newnodes.extend, (n.as_list(stringidx) for n in nodelist))
+    print >> outfile, "_charnodes =",
+    pprint.pprint(newnodes, stream=outfile)
+    
     function = ["def lookup_charcode(code):",
-                "    res = None"]
+                "    res = -1"]
     ranges = collapse_ranges(findranges(reversedict))
     for low, high in ranges:
         if high - low <= MINLIST:
@@ -76,7 +217,7 @@
                 if code in reversedict:
                     function.append(
                         "    if code == %d: res = %s" %
-                        (code, reversedict[code]))
+                        (code, reversedict[code].index))
             continue
 
         function.append(
@@ -85,23 +226,20 @@
 
         print >> outfile, "_charnames_%d = [" % (low,)
         for code in range(low, high + 1):
-            print >> outfile, "%s," % (reversedict.get(code),)
+            if code in reversedict:
+                print >> outfile, "%s," % (reversedict[code].index,)
+            else:
+                print >> outfile, "-1,"
         print >> outfile, "]\n"
 
-    function.extend(["    if res is None: raise KeyError, code",
-                     "    rstr = []",
-                     "    left = res.left",
-                     "    while res:",
-                     "        if res.left is left:",
-                     "            rstr.insert(0, res.substring)",
-                     "        left = res",
-                     "        res = res.parent",
-                     "    return ''.join(rstr)",
+    function.extend(["    if res == -1: raise KeyError, code",
+                     "    return name_of_node(res)",
                      "",
                      ])
     print >> outfile, '\n'.join(function)
+    return rootnode
 
-def gen_compression_tree(outfile, stringlist, ucdata, reversedict, parent=None, parent_str="", left=False, counter=[0]):
+def gen_compression_tree(stringlist, ucdata, reversedict, parent=None, parent_str="", left=False):
     # Find "best" startstring
     if not stringlist:
         return None
@@ -118,7 +256,7 @@
 
     has_substring = []
     other_substring = []
-    codepoint = None
+    codepoint = -1
     for string in stringlist:
         if string == newcode:
             codepoint = ucdata[parent_str+string]
@@ -127,25 +265,18 @@
         else:
             other_substring.append(string)
 
-    btnode = "_charnode_%d_" % (counter[0],)
-    args = '%r, %s' % (newcode, parent)
-    if left:
-        args += ', left=True'
-    if codepoint:
-        args += ', codepoint=%d' % (codepoint,)
-        reversedict[codepoint] = btnode        
-
-    print >> outfile, "%s = BTreeEntry(%s)" % (btnode, args)
-    counter[0] += 1
+    btnode = TrieEntry(newcode, parent, left, codepoint)
+    if codepoint != -1:
+        reversedict[codepoint] = btnode
 
     gen_compression_tree(
-        outfile, has_substring, ucdata, reversedict,
+        has_substring, ucdata, reversedict,
         parent=btnode, parent_str=parent_str+newcode,
-        left=True, counter=counter)
+        left=True)
     gen_compression_tree(
-        outfile, other_substring, ucdata, reversedict,
+        other_substring, ucdata, reversedict,
         parent=btnode, parent_str=parent_str,
-        left=False, counter=counter)
+        left=False)
 
     return btnode
 
@@ -164,3 +295,17 @@
     nchars = sum(map(len, cset))
 
     return strsize, nodecount, nchars
+
+if __name__ == '__main__':
+    testdata = {
+        'AAA' : 0,
+        'AAAA' : 1,
+        'AAB' : 2,
+        'ABA' : 3,
+        'BBB' : 4,
+        'ACA' : 5,
+        }
+
+    import sys
+
+    build_compression_tree(sys.stdout, testdata)



More information about the Pypy-commit mailing list