[pypy-svn] r59015 - in pypy/dist/pypy/module/unicodedata: . test

Sun Oct 12 12:46:17 CEST 2008

Author: iko
Date: Sun Oct 12 12:46:16 2008
New Revision: 59015

Modified:
   pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
   pypy/dist/pypy/module/unicodedata/test/test_unicodedata.py
Log:
Use per-range lists instead of a single dict for Unicode character names



Modified: pypy/dist/pypy/module/unicodedata/generate_unicodedb.py
==============================================================================

--- pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	(original)
+++ pypy/dist/pypy/module/unicodedata/generate_unicodedb.py	Sun Oct 12 12:46:16 2008
@@ -313,20 +313,68 @@
     print >> outfile, 'def combining(code): return _get_record(code)[4]'
 
 def write_character_names(outfile, table):
-    # Compressed Character names
+    def findranges(d):
+        ranges = []
+        for i in range(max(d)+1):
+            if i in d:
+                if not ranges:
+                    ranges.append((i,i))
+                    last = i
+                    continue
+                if last + 1 == i:
+                    ranges[-1] = (ranges[-1][0], i)
+                else:
+                    ranges.append((i,i))
+                last = i
+        return ranges
+
+    def collapse_ranges(ranges):
+        collapsed = [ranges[0]]
+        for i in range(1,len(ranges)):
+            lows, lowe = collapsed[-1]
+            highs, highe = ranges[i]
+            if highs - lowe < max([lowe - lows, highe - highs]):
+                collapsed[-1] = (lows, highe)
+            else:
+                collapsed.append(ranges[i])
+
+        return collapsed
+
     names = [table[code].name for code in range(len(table)) if table[code].name]
     codelist = compression.build_compression_table(names)
-    print >> outfile, '_charnames = {'
-    for code in range(len(table)):
-        name = table[code].name
-        if name:
-            print >> outfile, '%r: %r,' % (
-                code, compression.compress(codelist, name))
-    print >> outfile, "}\n"
     print >> outfile, "_codelist =", 
     pprint.pprint(codelist, outfile)
 
+    codes = set(code for code in range(len(table)) if table[code].name)
+    ranges = collapse_ranges(findranges(codes))
 
+    f_reverse_dict = ["def _gen_reverse_dict():",
+                      "    res = {}"]
+    function = ["def lookup_charcode(code):",
+                "    from pypy.module.unicodedata import compression",
+                "    res = None"]
+    for low, high in ranges:
+        function.append(
+            "    if %d <= code <= %d: res = _charnames_%d[code-%d]" % (
+            low, high, low, low))
+        f_reverse_dict.append(
+            "    for i in range(%d, %d): res[_charnames_%d[i-%d]] = i" % (
+            low, high+1, low, low))
+        print >> outfile, "_charnames_%d = [" % (low,)
+        for code in range(low, high + 1):
+            name = table[code].name
+            if name:
+                print >> outfile, '%r,' % (
+                    compression.compress(codelist, name))
+            else:
+                print >> outfile, 'None,'
+        print >> outfile, "]\n"
+    function.extend(["    if res is None: raise KeyError, code",
+                     "    return compression.uncompress(_codelist, res)\n"])
+    print >> outfile, '\n'.join(function)
+    f_reverse_dict.append("    return res\n")
+    print >> outfile, '\n'.join(f_reverse_dict)
+    
 def writeUnicodedata(version, table, outfile):
     # Version
     print >> outfile, 'version = %r' % version
@@ -339,7 +387,7 @@
     write_character_names(outfile, table)
     
     print >> outfile, '''
-_code_by_name = dict(map(lambda x:(x[1],x[0]), _charnames.iteritems()))
+_code_by_name = _gen_reverse_dict()
 
 _cjk_prefix = "CJK UNIFIED IDEOGRAPH-"
 _hangul_prefix = 'HANGUL SYLLABLE '
@@ -407,7 +455,6 @@
     return _code_by_name[compression.compress(_codelist, name)]
 
 def name(code):
-    from pypy.module.unicodedata import compression
     if (0x3400 <= code <= 0x4DB5 or
         0x4E00 <= code <= 0x%X or
         0x20000 <= code <= 0x2A6D6):
@@ -422,7 +469,7 @@
         return ("HANGUL SYLLABLE " + _hangul_L[l_code] +
                 _hangul_V[v_code] + _hangul_T[t_code])
     
-    return compression.uncompress(_codelist, _charnames[code])
+    return lookup_charcode(code)
 ''' % (cjk_end, cjk_end)
 
     # Categories

Modified: pypy/dist/pypy/module/unicodedata/test/test_unicodedata.py
==============================================================================
--- pypy/dist/pypy/module/unicodedata/test/test_unicodedata.py	(original)
+++ pypy/dist/pypy/module/unicodedata/test/test_unicodedata.py	Sun Oct 12 12:46:16 2008
@@ -10,7 +10,8 @@
         space = gettestobjspace(usemodules=('unicodedata',))
         cls.space = space
         charlist_w = []
-        for i in range(2000):
+        nocharlist_w = []
+        while len(charlist_w) < 1000 or len(nocharlist_w) < 1000:
             chr = unichr(random.randrange(65536))
             try:
                 w_tup = space.newtuple([
@@ -19,8 +20,9 @@
                     ])
                 charlist_w.append(w_tup)
             except ValueError:
-                pass
+                nocharlist_w.append(space.wrap(chr))
         cls.w_charlist = space.newlist(charlist_w)
+        cls.w_nocharlist = space.newlist(nocharlist_w)
 
     def test_hangul_syllables(self):
         import unicodedata
@@ -93,3 +95,7 @@
             assert unicodedata.name(chr) == name
             assert unicodedata.lookup(name) == chr
 
+    def test_random_missing_chars(self):
+        import unicodedata
+        for chr in self.nocharlist:
+            raises(ValueError, unicodedata.name, chr)