[Python-checkins] CVS: python/dist/src/Lib sre.py,1.4,1.5 sre_compile.py,1.3,1.4 sre_constants.py,1.3,1.4

Jeremy Hylton python-dev@python.org
Thu, 1 Jun 2000 10:39:14 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory slayer.i.sourceforge.net:/tmp/cvs-serv26344

Modified Files:
	sre.py sre_compile.py sre_constants.py 
Log Message:
Fredrik Lundh: here's the 96.6% version of SRE


Index: sre.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -r1.4 -r1.5
*** sre.py	2000/05/02 15:52:33	1.4
--- sre.py	2000/06/01 17:39:12	1.5
***************
*** 1,6 ****
- # -*- Mode: Python; tab-width: 4 -*-
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre.py,v 1.4 2000/05/02 15:52:33 guido Exp $
  #
  # re-compatible interface for the sre matching engine
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre.py,v 1.5 2000/06/01 17:39:12 jhylton Exp $
  #
  # re-compatible interface for the sre matching engine
***************
*** 8,14 ****
  # Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
  #
- # This code can only be used for 1.6 alpha testing.  All other use
- # require explicit permission from Secret Labs AB.
- #
  # Portions of this engine have been developed in cooperation with
  # CNRI.  Hewlett-Packard provided funding for 1.6 integration and
--- 7,10 ----
***************
*** 16,45 ****
  #
  
- """
- this is a long string
- """
- 
  import sre_compile
  
  # --------------------------------------------------------------------
  # public interface
  
! def compile(pattern, flags=0):
!     return sre_compile.compile(pattern, _fixflags(flags))
  
  def match(pattern, string, flags=0):
!     return compile(pattern, _fixflags(flags)).match(string)
  
  def search(pattern, string, flags=0):
!     return compile(pattern, _fixflags(flags)).search(string)
  
! # FIXME: etc
  
  # --------------------------------------------------------------------
! # helpers
  
! def _fixflags(flags):
!     # convert flag bitmask to sequence
!     assert not flags
!     return ()
  
--- 12,132 ----
  #
  
  import sre_compile
  
+ # flags
+ I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
+ L = LOCALE = sre_compile.SRE_FLAG_LOCALE
+ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
+ S = DOTALL = sre_compile.SRE_FLAG_DOTALL
+ X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
+ 
  # --------------------------------------------------------------------
  # public interface
  
! # FIXME: add docstrings
  
  def match(pattern, string, flags=0):
!     return _compile(pattern, flags).match(string)
  
  def search(pattern, string, flags=0):
!     return _compile(pattern, flags).search(string)
! 
! def sub(pattern, repl, string, count=0):
!     return _compile(pattern).sub(repl, string, count)
! 
! def subn(pattern, repl, string, count=0):
!     return _compile(pattern).subn(repl, string, count)
! 
! def split(pattern, string, maxsplit=0):
!     return _compile(pattern).split(string, maxsplit)
! 
! def findall(pattern, string, maxsplit=0):
!     return _compile(pattern).findall(string, maxsplit)
! 
! def compile(pattern, flags=0):
!     return _compile(pattern, flags)
  
! def escape(pattern):
!     s = list(pattern)
!     for i in range(len(pattern)):
!         c = pattern[i]
!         if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"):
!             if c == "\000":
!                 s[i] = "\\000"
!             else:
!                 s[i] = "\\" + c
!     return pattern[:0].join(s)
  
  # --------------------------------------------------------------------
! # internals
  
! _cache = {}
! _MAXCACHE = 100
  
+ def _compile(pattern, flags=0):
+     # internal: compile pattern
+     tp = type(pattern)
+     if tp not in (type(""), type(u"")):
+         return pattern
+     key = (tp, pattern, flags)
+     try:
+         return _cache[key]
+     except KeyError:
+         pass
+     p = sre_compile.compile(pattern, flags)
+     if len(_cache) >= _MAXCACHE:
+         _cache.clear()
+     _cache[key] = p
+     return p
+ 
+ def _sub(pattern, template, string, count=0):
+     # internal: pattern.sub implementation hook
+     return _subn(pattern, template, string, count)[0]
+ 
+ def _expand(match, template):
+     # internal: expand template
+     return template # FIXME
+ 
+ def _subn(pattern, template, string, count=0):
+     # internal: pattern.subn implementation hook
+     if callable(template):
+         filter = callable
+     else:
+         # FIXME: prepare template
+         def filter(match, template=template):
+             return _expand(match, template)
+     n = i = 0
+     s = []
+     append = s.append
+     c = pattern.cursor(string)
+     while not count or n < count:
+         m = c.search()
+         if not m:
+             break
+         j = m.start()
+         if j > i:
+             append(string[i:j])
+         append(filter(m))
+         i = m.end()
+         n = n + 1
+     if i < len(string):
+         append(string[i:])
+     return string[:0].join(s), n
+ 
+ def _split(pattern, string, maxsplit=0):
+     # internal: pattern.split implementation hook
+     n = i = 0
+     s = []
+     append = s.append
+     c = pattern.cursor(string)
+     while not maxsplit or n < maxsplit:
+         m = c.search()
+         if not m:
+             break
+         j = m.start()
+         append(string[i:j])
+         i = m.end()
+         n = n + 1
+     if i < len(string):
+         append(string[i:])
+     return s

Index: sre_compile.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_compile.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** sre_compile.py	2000/04/10 17:10:48	1.3
--- sre_compile.py	2000/06/01 17:39:12	1.4
***************
*** 1,5 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_compile.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # convert template to internal format
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_compile.py,v 1.4 2000/06/01 17:39:12 jhylton Exp $
  #
  # convert template to internal format
***************
*** 15,21 ****
  #
  
- # FIXME: <fl> formalize (objectify?) and document the compiler code
- # format, so that other frontends can use this compiler
- 
  import array, string, sys
  
--- 15,18 ----
***************
*** 46,76 ****
      def todata(self):
  	# print self.data
! 	return array.array(WORDSIZE, self.data).tostring()
! 
! def _lower(literal):
!     # return _sre._lower(literal) # FIXME
!     return string.lower(literal)
  
! def _compile(code, pattern, flags):
      append = code.append
      for op, av in pattern:
  	if op is ANY:
! 	    if "s" in flags:
! 		append(CODES[op]) # any character at all!
  	    else:
! 		append(CODES[NOT_LITERAL])
! 		append(10)
  	elif op in (SUCCESS, FAILURE):
! 	    append(CODES[op])
  	elif op is AT:
! 	    append(CODES[op])
! 	    append(POSITIONS[av])
  	elif op is BRANCH:
! 	    append(CODES[op])
  	    tail = []
  	    for av in av[1]:
  		skip = len(code); append(0)
! 		_compile(code, av, flags)
! 		append(CODES[JUMP])
  		tail.append(len(code)); append(0)
  		code[skip] = len(code) - skip
--- 43,76 ----
      def todata(self):
  	# print self.data
! 	try:
! 	    return array.array(WORDSIZE, self.data).tostring()
! 	except OverflowError:
! 	    print self.data
! 	    raise
  
! def _compile(code, pattern, flags, level=0):
      append = code.append
      for op, av in pattern:
  	if op is ANY:
! 	    if flags & SRE_FLAG_DOTALL:
! 		append(OPCODES[op]) # any character at all!
  	    else:
! 		append(OPCODES[CATEGORY])
! 		append(CHCODES[CATEGORY_NOT_LINEBREAK])
  	elif op in (SUCCESS, FAILURE):
! 	    append(OPCODES[op])
  	elif op is AT:
! 	    append(OPCODES[op])
! 	    if flags & SRE_FLAG_MULTILINE:
! 		append(ATCODES[AT_MULTILINE[av]])
! 	    else:
! 		append(ATCODES[av])
  	elif op is BRANCH:
! 	    append(OPCODES[op])
  	    tail = []
  	    for av in av[1]:
  		skip = len(code); append(0)
! 		_compile(code, av, flags, level)
! 		append(OPCODES[JUMP])
  		tail.append(len(code)); append(0)
  		code[skip] = len(code) - skip
***************
*** 79,107 ****
  		code[tail] = len(code) - tail
  	elif op is CALL:
! 	    append(CODES[op])
  	    skip = len(code); append(0)
! 	    _compile(code, av, flags)
! 	    append(CODES[SUCCESS])
  	    code[skip] = len(code) - skip
  	elif op is CATEGORY: # not used by current parser
! 	    append(CODES[op])
! 	    append(CATEGORIES[av])
  	elif op is GROUP:
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
  	    else:
! 		append(CODES[op])
! 	    append(av)
  	elif op is IN:
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
  		def fixup(literal):
! 		    return ord(_lower(literal))
  	    else:
! 		append(CODES[op])
  		fixup = ord
  	    skip = len(code); append(0)
  	    for op, av in av:
! 		append(CODES[op])
  		if op is NEGATE:
  		    pass
--- 79,110 ----
  		code[tail] = len(code) - tail
  	elif op is CALL:
! 	    append(OPCODES[op])
  	    skip = len(code); append(0)
! 	    _compile(code, av, flags, level+1)
! 	    append(OPCODES[SUCCESS])
  	    code[skip] = len(code) - skip
  	elif op is CATEGORY: # not used by current parser
! 	    append(OPCODES[op])
! 	    if flags & SRE_FLAG_LOCALE:
! 		append(CH_LOCALE[CHCODES[av]])
! 	    else:
! 		append(CHCODES[av])
  	elif op is GROUP:
! 	    if flags & SRE_FLAG_IGNORECASE:
! 		append(OPCODES[OP_IGNORE[op]])
  	    else:
! 		append(OPCODES[op])
! 	    append(av-1)
  	elif op is IN:
! 	    if flags & SRE_FLAG_IGNORECASE:
! 		append(OPCODES[OP_IGNORE[op]])
  		def fixup(literal):
! 		    return ord(literal.lower())
  	    else:
! 		append(OPCODES[op])
  		fixup = ord
  	    skip = len(code); append(0)
  	    for op, av in av:
! 		append(OPCODES[op])
  		if op is NEGATE:
  		    pass
***************
*** 112,129 ****
  		    append(fixup(av[1]))
  		elif op is CATEGORY:
! 		    append(CATEGORIES[av])
  		else:
  		    raise ValueError, "unsupported set operator"
! 	    append(CODES[FAILURE])
  	    code[skip] = len(code) - skip
  	elif op in (LITERAL, NOT_LITERAL):
! 	    if "i" in flags:
! 		append(CODES[MAP_IGNORE[op]])
! 		append(ord(_lower(av)))
  	    else:
! 		append(CODES[op])
  		append(ord(av))
  	elif op is MARK:
! 	    append(CODES[op])
  	    append(av)
   	elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
--- 115,135 ----
  		    append(fixup(av[1]))
  		elif op is CATEGORY:
! 		    if flags & SRE_FLAG_LOCALE:
! 			append(CH_LOCALE[CHCODES[av]])
! 		    else:
! 			append(CHCODES[av])
  		else:
  		    raise ValueError, "unsupported set operator"
! 	    append(OPCODES[FAILURE])
  	    code[skip] = len(code) - skip
  	elif op in (LITERAL, NOT_LITERAL):
! 	    if flags & SRE_FLAG_IGNORECASE:
! 		append(OPCODES[OP_IGNORE[op]])
! 		append(ord(av.lower()))
  	    else:
! 		append(OPCODES[op])
  		append(ord(av))
  	elif op is MARK:
! 	    append(OPCODES[op])
  	    append(av)
   	elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
***************
*** 132,167 ****
   		raise SyntaxError, "cannot repeat zero-width items"
  	    if lo == hi == 1 and op is MAX_REPEAT:
! 		append(CODES[MAX_REPEAT_ONE])
  		skip = len(code); append(0)
  		append(av[0])
  		append(av[1])
! 		_compile(code, av[2], flags)
! 		append(CODES[SUCCESS])
  		code[skip] = len(code) - skip
  	    else:
! 		append(CODES[op])
  		skip = len(code); append(0)
  		append(av[0])
  		append(av[1])
! 		_compile(code, av[2], flags)
  		if op is MIN_REPEAT:
! 		    append(CODES[MIN_UNTIL])
  		else:
! 		    # FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
! 		    append(CODES[MAX_UNTIL])
  		code[skip] = len(code) - skip
  	elif op is SUBPATTERN:
! ## 	    group = av[0]
! ## 	    if group:
! ## 		append(CODES[MARK])
! ## 		append((group-1)*2)
! 	    _compile(code, av[1], flags)
! ## 	    if group:
! ## 		append(CODES[MARK])
! ## 		append((group-1)*2+1)
  	else:
  	    raise ValueError, ("unsupported operand type", op)
  
! def compile(p, flags=()):
      # convert pattern list to internal format
      if type(p) in (type(""), type(u"")):
--- 138,172 ----
   		raise SyntaxError, "cannot repeat zero-width items"
  	    if lo == hi == 1 and op is MAX_REPEAT:
! 		append(OPCODES[MAX_REPEAT_ONE])
  		skip = len(code); append(0)
  		append(av[0])
  		append(av[1])
! 		_compile(code, av[2], flags, level+1)
! 		append(OPCODES[SUCCESS])
  		code[skip] = len(code) - skip
  	    else:
! 		append(OPCODES[op])
  		skip = len(code); append(0)
  		append(av[0])
  		append(av[1])
! 		_compile(code, av[2], flags, level+1)
  		if op is MIN_REPEAT:
! 		    append(OPCODES[MIN_UNTIL])
  		else:
! 		    append(OPCODES[MAX_UNTIL])
  		code[skip] = len(code) - skip
  	elif op is SUBPATTERN:
!  	    group = av[0]
!  	    if group:
!  		append(OPCODES[MARK])
!  		append((group-1)*2)
! 	    _compile(code, av[1], flags, level+1)
!  	    if group:
!  		append(OPCODES[MARK])
!  		append((group-1)*2+1)
  	else:
  	    raise ValueError, ("unsupported operand type", op)
  
! def compile(p, flags=0):
      # convert pattern list to internal format
      if type(p) in (type(""), type(u"")):
***************
*** 171,180 ****
      else:
  	pattern = None
!     # print p.getwidth()
!     # print p
      code = Code()
!     _compile(code, p.data, p.pattern.flags)
!     code.append(CODES[SUCCESS])
!     # print list(code.data)
      data = code.todata()
      if 0: # debugging
--- 176,183 ----
      else:
  	pattern = None
!     flags = p.pattern.flags | flags
      code = Code()
!     _compile(code, p.data, flags)
!     code.append(OPCODES[SUCCESS])
      data = code.todata()
      if 0: # debugging
***************
*** 184,187 ****
  	sre_disasm.disasm(data)
  	print "-" * 68
!     # print len(data), p.pattern.groups, len(p.pattern.groupdict)
!     return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
--- 187,193 ----
  	sre_disasm.disasm(data)
  	print "-" * 68
!     return _sre.compile(
! 	pattern, flags,
! 	data,
! 	p.pattern.groups-1, p.pattern.groupdict
! 	)

Index: sre_constants.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre_constants.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -r1.3 -r1.4
*** sre_constants.py	2000/04/10 17:10:48	1.3
--- sre_constants.py	2000/06/01 17:39:12	1.4
***************
*** 1,5 ****
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_constants.py,v 1.3 2000/04/10 17:10:48 guido Exp $
  #
  # various symbols used by the regular expression engine.
--- 1,5 ----
  #
  # Secret Labs' Regular Expression Engine
! # $Id: sre_constants.py,v 1.4 2000/06/01 17:39:12 jhylton Exp $
  #
  # various symbols used by the regular expression engine.
***************
*** 49,58 ****
  # positions
  AT_BEGINNING = "at_beginning"
  AT_BOUNDARY = "at_boundary"
  AT_NON_BOUNDARY = "at_non_boundary"
  AT_END = "at_end"
  
  # categories
- 
  CATEGORY_DIGIT = "category_digit"
  CATEGORY_NOT_DIGIT = "category_not_digit"
--- 49,59 ----
  # positions
  AT_BEGINNING = "at_beginning"
+ AT_BEGINNING_LINE = "at_beginning_line"
  AT_BOUNDARY = "at_boundary"
  AT_NON_BOUNDARY = "at_non_boundary"
  AT_END = "at_end"
+ AT_END_LINE = "at_end_line"
  
  # categories
  CATEGORY_DIGIT = "category_digit"
  CATEGORY_NOT_DIGIT = "category_not_digit"
***************
*** 61,66 ****
  CATEGORY_WORD = "category_word"
  CATEGORY_NOT_WORD = "category_not_word"
  
! CODES = [
  
      # failure=0 success=1 (just because it looks better that way :-)
--- 62,77 ----
  CATEGORY_WORD = "category_word"
  CATEGORY_NOT_WORD = "category_not_word"
+ CATEGORY_LINEBREAK = "category_linebreak"
+ CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
+ CATEGORY_LOC_DIGIT = "category_loc_digit"
+ CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
+ CATEGORY_LOC_SPACE = "category_loc_space"
+ CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
+ CATEGORY_LOC_WORD = "category_loc_word"
+ CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
+ CATEGORY_LOC_LINEBREAK = "category_loc_linebreak"
+ CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak"
  
! OPCODES = [
  
      # failure=0 success=1 (just because it looks better that way :-)
***************
*** 87,101 ****
  
  ]
  
! # convert to dictionary
! c = {}
! i = 0
! for code in CODES:
!     c[code] = i
!     i = i + 1
! CODES = c
  
  # replacement operations for "ignore case" mode
! MAP_IGNORE = {
      GROUP: GROUP_IGNORE,
      IN: IN_IGNORE,
--- 98,130 ----
  
  ]
+ 
+ ATCODES = [
+     AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
+     AT_NON_BOUNDARY, AT_END, AT_END_LINE
+ ]
  
! CHCODES = [
!     CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
!     CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
!     CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT,
!     CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE,
!     CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD,
!     CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK
! ]
! 
! def makedict(list):
!     d = {}
!     i = 0
!     for item in list:
! 	d[item] = i
! 	i = i + 1
!     return d
! 
! OPCODES = makedict(OPCODES)
! ATCODES = makedict(ATCODES)
! CHCODES = makedict(CHCODES)
  
  # replacement operations for "ignore case" mode
! OP_IGNORE = {
      GROUP: GROUP_IGNORE,
      IN: IN_IGNORE,
***************
*** 104,131 ****
  }
  
! POSITIONS = {
!     AT_BEGINNING: ord("a"),
!     AT_BOUNDARY: ord("b"),
!     AT_NON_BOUNDARY: ord("B"),
!     AT_END: ord("z"),
  }
  
! CATEGORIES = {
!     CATEGORY_DIGIT: ord("d"),
!     CATEGORY_NOT_DIGIT: ord("D"),
!     CATEGORY_SPACE: ord("s"),
!     CATEGORY_NOT_SPACE: ord("S"),
!     CATEGORY_WORD: ord("w"),
!     CATEGORY_NOT_WORD: ord("W"),
  }
  
  if __name__ == "__main__":
      import string
!     items = CODES.items()
!     items.sort(lambda a, b: cmp(a[1], b[1]))
      f = open("sre_constants.h", "w")
!     f.write("/* generated by sre_constants.py */\n")
!     for k, v in items:
! 	f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
      f.close()
      print "done"
--- 133,172 ----
  }
  
! AT_MULTILINE = {
!     AT_BEGINNING: AT_BEGINNING_LINE,
!     AT_END: AT_END_LINE
  }
  
! CH_LOCALE = {
!     CATEGORY_DIGIT: CATEGORY_LOC_DIGIT,
!     CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT,
!     CATEGORY_SPACE: CATEGORY_LOC_SPACE,
!     CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE,
!     CATEGORY_WORD: CATEGORY_LOC_WORD,
!     CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
!     CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK,
!     CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK
  }
  
+ # flags
+ SRE_FLAG_TEMPLATE = 1 # NYI
+ SRE_FLAG_IGNORECASE = 2
+ SRE_FLAG_LOCALE = 4
+ SRE_FLAG_MULTILINE = 8
+ SRE_FLAG_DOTALL = 16
+ SRE_FLAG_VERBOSE = 32
+ 
  if __name__ == "__main__":
      import string
!     def dump(f, d, prefix):
! 	items = d.items()
! 	items.sort(lambda a, b: cmp(a[1], b[1]))
! 	for k, v in items:
! 	    f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
      f = open("sre_constants.h", "w")
!     f.write("/* generated from sre_constants.py */\n")
!     dump(f, OPCODES, "SRE_OP")
!     dump(f, ATCODES, "SRE")
!     dump(f, CHCODES, "SRE")
      f.close()
      print "done"