[Python-checkins] CVS: python/dist/src/Lib sre_compile.py,1.19,1.20 sre_constants.py,1.13,1.14 sre_parse.py,1.20,1.21
Fredrik Lundh
python-dev@python.org
Sun, 2 Jul 2000 05:00:09 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv10442/Lib
Modified Files:
sre_compile.py sre_constants.py sre_parse.py
Log Message:
-- use charset bitmaps where appropriate. this gives a 5-10%
speedup for some tests, including the python tokenizer.
-- added support for an optional charset anchor to the engine
(currently unused by the code generator).
-- removed workaround for array module bug.
Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -r1.19 -r1.20
*** sre_compile.py 2000/07/01 17:50:59 1.19
--- sre_compile.py 2000/07/02 12:00:06 1.20
***************
*** 17,21 ****
# find an array type code that matches the engine's code size
! for WORDSIZE in "BHil":
if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
break
--- 17,21 ----
# find an array type code that matches the engine's code size
! for WORDSIZE in "Hil":
if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize():
break
***************
*** 23,26 ****
--- 23,85 ----
raise RuntimeError, "cannot find a useable array type"
+ MAXCODE = 65535
+
+ def _charset(charset, fixup):
+ # internal: optimize character set
+ out = []
+ charmap = [0]*256
+ try:
+ for op, av in charset:
+ if op is NEGATE:
+ out.append((op, av))
+ elif op is LITERAL:
+ charmap[fixup(av)] = 1
+ elif op is RANGE:
+ for i in range(fixup(av[0]), fixup(av[1])+1):
+ charmap[i] = 1
+ elif op is CATEGORY:
+ # FIXME: could append to charmap tail
+ return charset # cannot compress
+ except IndexError:
+ # unicode
+ return charset
+ # compress character map
+ i = p = n = 0
+ runs = []
+ for c in charmap:
+ if c:
+ if n == 0:
+ p = i
+ n = n + 1
+ elif n:
+ runs.append((p, n))
+ n = 0
+ i = i + 1
+ if n:
+ runs.append((p, n))
+ if len(runs) <= 2:
+ # use literal/range
+ for p, n in runs:
+ if n == 1:
+ out.append((LITERAL, p))
+ else:
+ out.append((RANGE, (p, p+n-1)))
+ if len(out) < len(charset):
+ return out
+ else:
+ # use bitmap
+ data = []
+ m = 1; v = 0
+ for c in charmap:
+ if c:
+ v = v + m
+ m = m << 1
+ if m > MAXCODE:
+ data.append(v)
+ m = 1; v = 0
+ out.append((CHARSET, data))
+ return out
+ return charset
+
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
***************
*** 42,46 ****
fixup = lambda x: x
skip = len(code); emit(0)
! for op, av in av:
emit(OPCODES[op])
if op is NEGATE:
--- 101,105 ----
fixup = lambda x: x
skip = len(code); emit(0)
! for op, av in _charset(av, fixup):
emit(OPCODES[op])
if op is NEGATE:
***************
*** 51,54 ****
--- 110,115 ----
emit(fixup(av[0]))
emit(fixup(av[1]))
+ elif op is CHARSET:
+ code.extend(av)
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
***************
*** 156,161 ****
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
! # this contains min/max pattern width and a literal prefix,
! # if any
lo, hi = pattern.getwidth()
if lo == 0:
--- 217,222 ----
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
! # this contains min/max pattern width, and an optional literal
! # prefix or a character map
lo, hi = pattern.getwidth()
if lo == 0:
***************
*** 163,166 ****
--- 224,228 ----
# look for a literal prefix
prefix = []
+ charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE):
for op, av in pattern.data:
***************
*** 175,198 ****
# literal flag
mask = 0
! if len(prefix) == len(pattern.data):
! mask = 1
emit(mask)
# pattern length
! emit(lo)
! if hi < 32768:
emit(hi)
else:
emit(0)
# add literal prefix
- emit(len(prefix))
if prefix:
! code.extend(prefix)
! # generate overlap table
! table = [-1] + ([0]*len(prefix))
! for i in range(len(prefix)):
! table[i+1] = table[i]+1
! while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
! table[i+1] = table[table[i+1]-1]+1
! code.extend(table[1:]) # don't store first entry
code[skip] = len(code) - skip
--- 237,274 ----
# literal flag
mask = 0
! if prefix:
! mask = SRE_INFO_PREFIX
! if len(prefix) == len(pattern.data):
! mask = mask + SRE_INFO_LITERAL
! elif charset:
! mask = mask + SRE_INFO_CHARSET
emit(mask)
# pattern length
! if lo < MAXCODE:
! emit(lo)
! else:
! emit(MAXCODE)
! prefix = prefix[:MAXCODE]
! if hi < MAXCODE:
emit(hi)
else:
emit(0)
# add literal prefix
if prefix:
! emit(len(prefix))
! if prefix:
! code.extend(prefix)
! # generate overlap table
! table = [-1] + ([0]*len(prefix))
! for i in range(len(prefix)):
! table[i+1] = table[i]+1
! while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
! table[i+1] = table[table[i+1]-1]+1
! code.extend(table[1:]) # don't store first entry
! elif charset:
! for char in charset:
! emit(OPCODES[LITERAL])
! emit(char)
! emit(OPCODES[FAILURE])
code[skip] = len(code) - skip
Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -r1.13 -r1.14
*** sre_constants.py 2000/07/01 17:50:59 1.13
--- sre_constants.py 2000/07/02 12:00:06 1.14
***************
*** 29,32 ****
--- 29,33 ----
CALL = "call"
CATEGORY = "category"
+ CHARSET = "charset"
GROUP = "group"
GROUP_IGNORE = "group_ignore"
***************
*** 88,91 ****
--- 89,93 ----
CALL,
CATEGORY,
+ CHARSET,
GROUP, GROUP_IGNORE,
IN, IN_IGNORE,
***************
*** 167,177 ****
# flags
! SRE_FLAG_TEMPLATE = 1
! SRE_FLAG_IGNORECASE = 2
! SRE_FLAG_LOCALE = 4
! SRE_FLAG_MULTILINE = 8
! SRE_FLAG_DOTALL = 16
! SRE_FLAG_UNICODE = 32
! SRE_FLAG_VERBOSE = 64
if __name__ == "__main__":
--- 169,184 ----
# flags
! SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
! SRE_FLAG_IGNORECASE = 2 # case insensitive
! SRE_FLAG_LOCALE = 4 # honour system locale
! SRE_FLAG_MULTILINE = 8 # treat target as multiline string
! SRE_FLAG_DOTALL = 16 # treat target as a single string
! SRE_FLAG_UNICODE = 32 # use unicode locale
! SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
!
! # flags for INFO primitive
! SRE_INFO_PREFIX = 1 # has prefix
! SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
! SRE_INFO_CHARSET = 4 # pattern starts with character from given set
if __name__ == "__main__":
***************
*** 202,205 ****
--- 209,213 ----
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
+
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
***************
*** 209,212 ****
--- 217,225 ----
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
+
+ f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
+ f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
+ f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
+
f.close()
print "done"
Index: sre_parse.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_parse.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -r1.20 -r1.21
*** sre_parse.py 2000/07/01 23:49:14 1.20
--- sre_parse.py 2000/07/02 12:00:06 1.21
***************
*** 17,25 ****
from sre_constants import *
! # FIXME: should be 65535, but the arraymodule is still broken
! MAXREPEAT = 32767
! # FIXME: might change in 2.0 final. but for now, this seems
! # to be the best way to be compatible with 1.5.2
CHARMASK = 0xff
--- 17,24 ----
from sre_constants import *
! MAXREPEAT = 65535
! # FIXME: the following might change in 2.0 final. but for now, this
! # seems to be the best way to be compatible with 1.5.2
CHARMASK = 0xff