[Python-checkins] python/dist/src/Lib sre_compile.py,1.46,1.47 sre_constants.py,1.31,1.32

loewis@users.sourceforge.net loewis@users.sourceforge.net
Sat, 19 Apr 2003 05:56:10 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory sc8-pr-cvs1:/tmp/cvs-serv11938/Lib

Modified Files:
	sre_compile.py sre_constants.py 
Log Message:
Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.


Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.46
retrieving revision 1.47
diff -C2 -d -r1.46 -r1.47
*** sre_compile.py	14 Apr 2003 17:59:31 -0000	1.46
--- sre_compile.py	19 Apr 2003 12:56:07 -0000	1.47
***************
*** 17,21 ****
  assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  
! MAXCODE = 65535
  
  def _compile(code, pattern, flags):
--- 17,24 ----
  assert _sre.MAGIC == MAGIC, "SRE module mismatch"
  
! if _sre.CODESIZE == 2:
!     MAXCODE = 65535
! else:
!     MAXCODE = 0xFFFFFFFFL
  
  def _compile(code, pattern, flags):
***************
*** 192,198 ****
                  return charset # cannot compress
      except IndexError:
-         if sys.maxunicode != 65535:
-             # XXX: big charsets don't work in UCS-4 builds
-             return charset
          # character set contains unicode characters
          return _optimize_unicode(charset, fixup)
--- 195,198 ----
***************
*** 229,233 ****
  def _mk_bitmap(bits):
      data = []
!     m = 1; v = 0
      for c in bits:
          if c:
--- 229,237 ----
  def _mk_bitmap(bits):
      data = []
!     if _sre.CODESIZE == 2:
!         start = (1, 0)
!     else:
!         start = (1L, 0L)
!     m, v = start
      for c in bits:
          if c:
***************
*** 236,240 ****
          if m > MAXCODE:
              data.append(v)
!             m = 1; v = 0
      return data
  
--- 240,244 ----
          if m > MAXCODE:
              data.append(v)
!             m, v = start
      return data
  
***************
*** 259,277 ****
  # CHARSET matching).
  
  def _optimize_unicode(charset, fixup):
      charmap = [0]*65536
      negate = 0
!     for op, av in charset:
!         if op is NEGATE:
!             negate = 1
!         elif op is LITERAL:
!             charmap[fixup(av)] = 1
!         elif op is RANGE:
!             for i in range(fixup(av[0]), fixup(av[1])+1):
!                 charmap[i] = 1
!         elif op is CATEGORY:
!             # XXX: could expand category
!             return charset # cannot compress
      if negate:
          for i in range(65536):
              charmap[i] = not charmap[i]
--- 263,298 ----
  # CHARSET matching).
  
+ # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
+ # of the basic multilingual plane; an efficient representation
+ # for all of UTF-16 has not yet been developed. This means,
+ # in particular, that negated charsets cannot be represented as
+ # bigcharsets.
+ 
  def _optimize_unicode(charset, fixup):
+     try:
+         import array
+     except ImportError:
+         return charset
      charmap = [0]*65536
      negate = 0
!     try:
!         for op, av in charset:
!             if op is NEGATE:
!                 negate = 1
!             elif op is LITERAL:
!                 charmap[fixup(av)] = 1
!             elif op is RANGE:
!                 for i in range(fixup(av[0]), fixup(av[1])+1):
!                     charmap[i] = 1
!             elif op is CATEGORY:
!                 # XXX: could expand category
!                 return charset # cannot compress
!     except IndexError:
!         # non-BMP characters
!         return charset
      if negate:
+         if sys.maxunicode != 65535:
+             # XXX: negation does not work with big charsets
+             return charset
          for i in range(65536):
              charmap[i] = not charmap[i]
***************
*** 288,297 ****
              data = data + _mk_bitmap(chunk)
      header = [block]
!     assert MAXCODE == 65535
!     for i in range(128):
!         if sys.byteorder == 'big':
!             header.append(256*mapping[2*i]+mapping[2*i+1])
!         else:
!             header.append(mapping[2*i]+256*mapping[2*i+1])
      data[0:0] = header
      return [(BIGCHARSET, data)]
--- 309,320 ----
              data = data + _mk_bitmap(chunk)
      header = [block]
!     if MAXCODE == 65535:
!         code = 'H'
!     else:
!         code = 'L'
!     # Convert block indices to byte array of 256 bytes
!     mapping = array.array('b', mapping).tostring()
!     # Convert byte array to word array
!     header = header + array.array(code, mapping).tolist()
      data[0:0] = header
      return [(BIGCHARSET, data)]

Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** sre_constants.py	14 Apr 2003 17:59:32 -0000	1.31
--- sre_constants.py	19 Apr 2003 12:56:07 -0000	1.32
***************
*** 14,18 ****
  # update when constants are added or removed
  
! MAGIC = 20010701
  
  # max code word in this release
--- 14,18 ----
  # update when constants are added or removed
  
! MAGIC = 20030419
  
  # max code word in this release