[Python-checkins] CVS: python/dist/src/Lib sre_compile.py,1.19,1.20 sre_constants.py,1.13,1.14 sre_parse.py,1.20,1.21

Fredrik Lundh python-dev@python.org
Sun, 2 Jul 2000 05:00:09 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv10442/Lib

Modified Files:
	sre_compile.py sre_constants.py sre_parse.py 
Log Message:


-- use charset bitmaps where appropriate.  this gives a 5-10%
   speedup for some tests, including the python tokenizer.

-- added support for an optional charset anchor to the engine
   (currently unused by the code generator).

-- removed workaround for array module bug.


Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -r1.19 -r1.20
*** sre_compile.py	2000/07/01 17:50:59	1.19
--- sre_compile.py	2000/07/02 12:00:06	1.20
***************
*** 17,21 ****
  
  # find an array type code that matches the engine's code size
! for WORDSIZE in "BHil":
      if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
          break
--- 17,21 ----
  
  # find an array type code that matches the engine's code size
! for WORDSIZE in "Hil":
      if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
          break
***************
*** 23,26 ****
--- 23,85 ----
      raise RuntimeError, "cannot find a useable array type"
  
+ MAXCODE = 65535
+ 
+ def _charset(charset, fixup):
+     # internal: optimize character set
+     out = []
+     charmap = [0]*256
+     try:
+         for op, av in charset:
+             if op is NEGATE:
+                 out.append((op, av))
+             elif op is LITERAL:
+                 charmap[fixup(av)] = 1
+             elif op is RANGE:
+                 for i in range(fixup(av[0]), fixup(av[1])+1):
+                     charmap[i] = 1
+             elif op is CATEGORY:
+                 # FIXME: could append to charmap tail
+                 return charset # cannot compress
+     except IndexError:
+         # unicode
+         return charset
+     # compress character map
+     i = p = n = 0
+     runs = []
+     for c in charmap:
+         if c:
+             if n == 0:
+                 p = i
+             n = n + 1
+         elif n:
+             runs.append((p, n))
+             n = 0
+         i = i + 1
+     if n:
+         runs.append((p, n))
+     if len(runs) <= 2:
+         # use literal/range
+         for p, n in runs:
+             if n == 1:
+                 out.append((LITERAL, p))
+             else:
+                 out.append((RANGE, (p, p+n-1)))
+         if len(out) < len(charset):
+             return out
+     else:
+         # use bitmap
+         data = []
+         m = 1; v = 0
+         for c in charmap:
+             if c:
+                 v = v + m
+             m = m << 1
+             if m > MAXCODE:
+                 data.append(v)
+                 m = 1; v = 0
+         out.append((CHARSET, data))
+         return out
+     return charset
+ 
  def _compile(code, pattern, flags):
      # internal: compile a (sub)pattern
***************
*** 42,46 ****
                  fixup = lambda x: x
              skip = len(code); emit(0)
!             for op, av in av:
                  emit(OPCODES[op])
                  if op is NEGATE:
--- 101,105 ----
                  fixup = lambda x: x
              skip = len(code); emit(0)
!             for op, av in _charset(av, fixup):
                  emit(OPCODES[op])
                  if op is NEGATE:
***************
*** 51,54 ****
--- 110,115 ----
                      emit(fixup(av[0]))
                      emit(fixup(av[1]))
+                 elif op is CHARSET:
+                     code.extend(av)
                  elif op is CATEGORY:
                      if flags & SRE_FLAG_LOCALE:
***************
*** 156,161 ****
  def _compile_info(code, pattern, flags):
      # internal: compile an info block.  in the current version,
!     # this contains min/max pattern width and a literal prefix,
!     # if any
      lo, hi = pattern.getwidth()
      if lo == 0:
--- 217,222 ----
  def _compile_info(code, pattern, flags):
      # internal: compile an info block.  in the current version,
!     # this contains min/max pattern width, and an optional literal
!     # prefix or a character map
      lo, hi = pattern.getwidth()
      if lo == 0:
***************
*** 163,166 ****
--- 224,228 ----
      # look for a literal prefix
      prefix = []
+     charset = [] # not used
      if not (flags & SRE_FLAG_IGNORECASE):
          for op, av in pattern.data:
***************
*** 175,198 ****
      # literal flag
      mask = 0
!     if len(prefix) == len(pattern.data):
!         mask = 1
      emit(mask)
      # pattern length
!     emit(lo)
!     if hi < 32768:
          emit(hi)
      else:
          emit(0)
      # add literal prefix
-     emit(len(prefix))
      if prefix:
!         code.extend(prefix)
!         # generate overlap table
!         table = [-1] + ([0]*len(prefix))
!         for i in range(len(prefix)):
!             table[i+1] = table[i]+1
!             while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
!                 table[i+1] = table[table[i+1]-1]+1
!         code.extend(table[1:]) # don't store first entry
      code[skip] = len(code) - skip
  
--- 237,274 ----
      # literal flag
      mask = 0
!     if prefix:
!         mask = SRE_INFO_PREFIX
!         if len(prefix) == len(pattern.data):
!             mask = mask + SRE_INFO_LITERAL
!     elif charset:
!         mask = mask + SRE_INFO_CHARSET
      emit(mask)
      # pattern length
!     if lo < MAXCODE:
!         emit(lo)
!     else:
!         emit(MAXCODE)
!         prefix = prefix[:MAXCODE]
!     if hi < MAXCODE:
          emit(hi)
      else:
          emit(0)
      # add literal prefix
      if prefix:
!         emit(len(prefix))
!         if prefix:
!             code.extend(prefix)
!             # generate overlap table
!             table = [-1] + ([0]*len(prefix))
!             for i in range(len(prefix)):
!                 table[i+1] = table[i]+1
!                 while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
!                     table[i+1] = table[table[i+1]-1]+1
!             code.extend(table[1:]) # don't store first entry
!     elif charset:
!         for char in charset:
!             emit(OPCODES[LITERAL])
!             emit(char)
!         emit(OPCODES[FAILURE])
      code[skip] = len(code) - skip
  

Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -r1.13 -r1.14
*** sre_constants.py	2000/07/01 17:50:59	1.13
--- sre_constants.py	2000/07/02 12:00:06	1.14
***************
*** 29,32 ****
--- 29,33 ----
  CALL = "call"
  CATEGORY = "category"
+ CHARSET = "charset"
  GROUP = "group"
  GROUP_IGNORE = "group_ignore"
***************
*** 88,91 ****
--- 89,93 ----
      CALL,
      CATEGORY,
+     CHARSET,
      GROUP, GROUP_IGNORE,
      IN, IN_IGNORE,
***************
*** 167,177 ****
  
  # flags
! SRE_FLAG_TEMPLATE = 1
! SRE_FLAG_IGNORECASE = 2
! SRE_FLAG_LOCALE = 4
! SRE_FLAG_MULTILINE = 8
! SRE_FLAG_DOTALL = 16
! SRE_FLAG_UNICODE = 32
! SRE_FLAG_VERBOSE = 64
  
  if __name__ == "__main__":
--- 169,184 ----
  
  # flags
! SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
! SRE_FLAG_IGNORECASE = 2 # case insensitive
! SRE_FLAG_LOCALE = 4 # honour system locale
! SRE_FLAG_MULTILINE = 8 # treat target as multiline string
! SRE_FLAG_DOTALL = 16 # treat target as a single string
! SRE_FLAG_UNICODE = 32 # use unicode locale
! SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
! 
! # flags for INFO primitive
! SRE_INFO_PREFIX = 1 # has prefix
! SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
! SRE_INFO_CHARSET = 4 # pattern starts with character from given set
  
  if __name__ == "__main__":
***************
*** 202,205 ****
--- 209,213 ----
      dump(f, ATCODES, "SRE")
      dump(f, CHCODES, "SRE")
+ 
      f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
      f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
***************
*** 209,212 ****
--- 217,225 ----
      f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
      f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
+ 
+     f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
+     f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
+     f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
+ 
      f.close()
      print "done"

Index: sre_parse.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_parse.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -r1.20 -r1.21
*** sre_parse.py	2000/07/01 23:49:14	1.20
--- sre_parse.py	2000/07/02 12:00:06	1.21
***************
*** 17,25 ****
  from sre_constants import *
  
! # FIXME: should be 65535, but the arraymodule is still broken
! MAXREPEAT = 32767
  
! # FIXME: might change in 2.0 final.  but for now, this seems
! # to be the best way to be compatible with 1.5.2
  CHARMASK = 0xff
  
--- 17,24 ----
  from sre_constants import *
  
! MAXREPEAT = 65535
  
! # FIXME: the following might change in 2.0 final.  but for now, this
! # seems to be the best way to be compatible with 1.5.2
  CHARMASK = 0xff