[Python-checkins] CVS: python/dist/src/Tools/unicode makeunicodedata.py,1.3,1.4

Fredrik Lundh <python-dev@python.org>
Mon, 25 Sep 2000 11:00:00 -0700


Update of /cvsroot/python/python/dist/src/Tools/unicode
In directory slayer.i.sourceforge.net:/tmp/cvs-serv8239/Tools/unicode

Modified Files:
	makeunicodedata.py 
Log Message:

unicode database compression, step 3:

- use unidb compression for the unicodectype module.  smaller,
  faster, and slightly more portable (a sketch of the lookup scheme
  follows below)

- also mention the unicode directory in Tools/README
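
For readers not familiar with the unidb scheme, here is a rough Python
sketch (illustrative only, not part of this checkin) of how the generated
two-level tables are consulted; the real lookup is done by C code built
around the SHIFT, index1 and index2 arrays that the script writes into
the *_db.h headers:

    def lookup(char, shift, index1, index2, records):
        # index1 maps the high bits of the code point to a block number;
        # index2 maps (block, low bits) to a slot in the record table.
        # identical blocks are stored only once, which is where the
        # space saving comes from.
        block = index1[char >> shift]
        slot = index2[(block << shift) + (char & ((1 << shift) - 1))]
        return records[slot]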

Index: makeunicodedata.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/unicode/makeunicodedata.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** makeunicodedata.py	2000/09/25 08:07:06	1.3
--- makeunicodedata.py	2000/09/25 17:59:57	1.4
***************
*** 1,8 ****
  #
! # generate a compact version of the unicode property database
  #
  # history:
  # 2000-09-24 fl   created (based on bits and pieces from unidb)
  # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
  #
  # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
--- 1,12 ----
  #
! # (re)generate unicode property and type databases
  #
+ # this script converts a unicode 3.0 database file to
+ # Modules/unicodedata_db.h and Objects/unicodetype_db.h
+ #
  # history:
  # 2000-09-24 fl   created (based on bits and pieces from unidb)
  # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
+ # 2000-09-25 fl   added character type table
  #
  # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
***************
*** 14,18 ****
  VERSION = "1.1"
  
! UNICODE_DATA = "../UnicodeData-Latest.txt"
  
  CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
--- 18,22 ----
  VERSION = "1.1"
  
! UNICODE_DATA = "UnicodeData-Latest.txt"
  
  CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
***************
*** 24,29 ****
      "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
      "ON" ]
  
! def maketable():
  
      unicode = UnicodeData(UNICODE_DATA)
--- 28,42 ----
      "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
      "ON" ]
+ 
+ ALPHA_MASK = 0x01
+ DECIMAL_MASK = 0x02
+ DIGIT_MASK = 0x04
+ LOWER_MASK = 0x08
+ NUMERIC_MASK = 0x10
+ SPACE_MASK = 0x20
+ TITLE_MASK = 0x40
+ UPPER_MASK = 0x80
  
! def maketables():
  
      unicode = UnicodeData(UNICODE_DATA)
***************
*** 75,79 ****
              decomp_index[char] = i
  
!     FILE = "unicodedata_db.h"
  
      sys.stdout = open(FILE, "w")
--- 88,92 ----
              decomp_index[char] = i
  
!     FILE = "Modules/unicodedata_db.h"
  
      sys.stdout = open(FILE, "w")
***************
*** 88,91 ****
--- 101,107 ----
      print
  
+     # FIXME: the following tables should be made static, and
+     # the support code moved into unicodedatabase.c
+ 
      print "/* string literals */"
      print "const char *_PyUnicode_CategoryNames[] = {"
***************
*** 107,122 ****
      print "};"
  
!     # split index table
      index1, index2, shift = splitbins(index)
  
!     print "/* index tables used to find the right database record */"
      print "#define SHIFT", shift
      Array("index1", index1).dump(sys.stdout)
      Array("index2", index2).dump(sys.stdout)
  
!     # split index table
      index1, index2, shift = splitbins(decomp_index)
  
!     print "/* same, for the decomposition data */"
      print "#define DECOMP_SHIFT", shift
      Array("decomp_index1", index1).dump(sys.stdout)
--- 123,138 ----
      print "};"
  
!     # split record index table
      index1, index2, shift = splitbins(index)
  
!     print "/* index tables for the database records */"
      print "#define SHIFT", shift
      Array("index1", index1).dump(sys.stdout)
      Array("index2", index2).dump(sys.stdout)
  
!     # split decomposition index table
      index1, index2, shift = splitbins(decomp_index)
  
!     print "/* index tables for the decomposition data */"
      print "#define DECOMP_SHIFT", shift
      Array("decomp_index1", index1).dump(sys.stdout)
***************
*** 125,128 ****
--- 141,216 ----
      sys.stdout = sys.__stdout__
  
+     #
+     # 3) unicode type data
+ 
+     # extract unicode types
+     dummy = (0, 0, 0, 0)
+     table = [dummy]
+     cache = {0: dummy}
+     index = [0] * len(unicode.chars)
+ 
+     for char in unicode.chars:
+         record = unicode.table[char]
+         if record:
+             # extract database properties
+             category = record[2]
+             bidirectional = record[4]
+             flags = 0
+             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
+                 flags |= ALPHA_MASK
+             if category == "Ll":
+                 flags |= LOWER_MASK
+             if category == "Zs" or bidirectional in ("WS", "B", "S"):
+                 flags |= SPACE_MASK
+             if category in ["Lt", "Lu"]:
+                 flags |= TITLE_MASK
+             if category == "Lu":
+                 flags |= UPPER_MASK
+             # use delta predictor for upper/lower/title
+             if record[12]:
+                 upper = (int(record[12], 16) - char) & 0xffff
+             else:
+                 upper = 0
+             if record[13]:
+                 lower = (int(record[13], 16) - char) & 0xffff
+             else:
+                 lower = 0
+             if record[14]:
+                 title = (int(record[14], 16) - char) & 0xffff
+             else:
+                 title = 0
+             item = (
+                 flags, upper, lower, title
+                 )
+             # add entry to index and item tables
+             i = cache.get(item)
+             if i is None:
+                 cache[item] = i = len(table)
+                 table.append(item)
+             index[char] = i
+ 
+     FILE = "Objects/unicodetype_db.h"
+ 
+     sys.stdout = open(FILE, "w")
+ 
+     print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+     print
+     print "/* a list of unique character type descriptors */"
+     print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
+     for item in table:
+         print "    {%d, %d, %d, %d}," % item
+     print "};"
+     print
+ 
+     # split the type index table
+     index1, index2, shift = splitbins(index)
+ 
+     print "/* type indexes */"
+     print "#define SHIFT", shift
+     Array("index1", index1).dump(sys.stdout)
+     Array("index2", index2).dump(sys.stdout)
+ 
+     sys.stdout = sys.__stdout__
+ 
  # --------------------------------------------------------------------
  # the following support code is taken from the unidb utilities
***************
*** 260,262 ****
  
  if __name__ == "__main__":
!     maketable()
--- 348,350 ----
  
  if __name__ == "__main__":
!     maketables()
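
The new type table packs each character into a small (flags, upper,
lower, title) record, with the case mappings stored as deltas modulo
0x10000.  The sketch below is illustrative only: the actual consumer is
the C code behind the unicodectype module, and decode_record is a
made-up name; only the mask values and the delta arithmetic come from
the script above.

    ALPHA_MASK, LOWER_MASK, SPACE_MASK = 0x01, 0x08, 0x20
    TITLE_MASK, UPPER_MASK = 0x40, 0x80

    def decode_record(char, record):
        flags, upper, lower, title = record
        return {
            "alpha": bool(flags & ALPHA_MASK),
            "lower": bool(flags & LOWER_MASK),
            "space": bool(flags & SPACE_MASK),
            "title": bool(flags & TITLE_MASK),
            "upper": bool(flags & UPPER_MASK),
            # deltas were encoded as (target - char) & 0xffff, so the
            # same masked addition recovers the mapped code point (a
            # delta of 0 maps the character to itself)
            "toupper": (char + upper) & 0xffff,
            "tolower": (char + lower) & 0xffff,
            "totitle": (char + title) & 0xffff,
        }

For example, the record generated for u"A" (0x41) is
(ALPHA_MASK | TITLE_MASK | UPPER_MASK, 0, 0x20, 0), so "tolower" comes
out as 0x61 (u"a") while "toupper" and "totitle" stay at 0x41.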