[Python-checkins] CVS: python/dist/src/Tools/unicode makeunicodedata.py,1.1,1.2

Tim Peters python-dev@python.org
Mon, 25 Sep 2000 00:13:44 -0700


Update of /cvsroot/python/python/dist/src/Tools/unicode
In directory slayer.i.sourceforge.net:/tmp/cvs-serv10487/python/dist/src/tools/unicode

Modified Files:
	makeunicodedata.py 
Log Message:
Fiddled w/ /F's cool new splitbins function:  documented it, generalized it
a bit, sped it a lot primarily by removing the unused assumption that None was
a legit bin entry (the function doesn't really need to assume that there's
anything special about 0), added an optional "trace" argument, and in __debug__
mode added exhaustive verification that the decomposition is both correct and
doesn't overstep any array bounds (which wasn't obvious to me from staring at the
generated C code -- now I feel safe!).  Did not commit a new unicodedata_db.h, as
the one produced by this version is identical to the one already checked in.


Index: makeunicodedata.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Tools/unicode/makeunicodedata.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -r1.1 -r1.2
*** makeunicodedata.py	2000/09/24 23:18:31	1.1
--- makeunicodedata.py	2000/09/25 07:13:41	1.2
***************
*** 166,201 ****
          return 4
  
! def splitbins(bins):
!     # split a sparse integer table into two tables, such as:
!     #   value = t2[(t1[char>>shift]<<shift)+(char&mask)]
!     # and value == 0 means no data
!     bytes = sys.maxint
!     for shift in range(16):
!         bin1 = []
!         bin2 = []
          size = 2**shift
          bincache = {}
!         for i in range(0, len(bins), size):
!             bin = bins[i:i+size]
!             index = bincache.get(tuple(bin))
              if index is None:
!                 index = len(bin2)
!                 bincache[tuple(bin)] = index
!                 for v in bin:
!                     if v is None:
!                         bin2.append(0)
!                     else:
!                         bin2.append(v)
!             bin1.append(index>>shift)
          # determine memory size
!         b = len(bin1)*getsize(bin1) + len(bin2)*getsize(bin2)
          if b < bytes:
!             best = shift, bin1, bin2
              bytes = b
!     shift, bin1, bin2 = best
! ##     print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
! ##         len(bin1), len(bin2), shift, bytes
! ##         )
!     return bin1, bin2, shift
  
  if __name__ == "__main__":
--- 166,229 ----
          return 4
  
! def splitbins(t, trace=0):
!     """t, trace=0 -> (t1, t2, shift).  Split a table to save space.
! 
!     t is a sequence of ints.  This function can be useful to save space if
!     many of the ints are the same.  t1 and t2 are lists of ints, and shift
!     is an int, chosen to minimize the combined size of t1 and t2 (in C
!     code), and where for each i in range(len(t)),
!         t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
!     where mask is a bitmask isolating the last "shift" bits.
! 
!     If optional arg trace is true (default false), progress info is
!     printed to sys.stderr.
!     """
! 
!     import sys
!     if trace:
!         def dump(t1, t2, shift, bytes):
!             print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
!                 len(t1), len(t2), shift, bytes)
!         print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
!                             "bytes"
!     n = len(t)-1    # last valid index
!     maxshift = 0    # the most we can shift n and still have something left
!     if n > 0:
!         while n >> 1:
!             n >>= 1
!             maxshift += 1
!     del n
!     bytes = sys.maxint  # smallest total size so far
!     t = tuple(t)    # so slices can be dict keys
!     for shift in range(maxshift + 1):
!         t1 = []
!         t2 = []
          size = 2**shift
          bincache = {}
!         for i in range(0, len(t), size):
!             bin = t[i:i+size]
!             index = bincache.get(bin)
              if index is None:
!                 index = len(t2)
!                 bincache[bin] = index
!                 t2.extend(bin)
!             t1.append(index >> shift)
          # determine memory size
!         b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
!         if trace:
!             dump(t1, t2, shift, b)
          if b < bytes:
!             best = t1, t2, shift
              bytes = b
!     t1, t2, shift = best
!     if trace:
!         print >>sys.stderr, "Best:",
!         dump(t1, t2, shift, bytes)
!     if __debug__:
!         # exhaustively verify that the decomposition is correct
!         mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
!         for i in xrange(len(t)):
!             assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
!     return best
  
  if __name__ == "__main__":