paul wrote:
As an aside: I would be pumped about getting a generic lexer into the Python distribution.
how about this quick and dirty proposal:

- add a new primitive to SRE: (?P#n), where n is a small integer.
  this primitive sets the match object's "index" variable to n
  when the engine stumbles upon it.

- given a list of "phrases", combine them into a single regular
  expression like this:

    (?:phrase1(?P#1))|(?:phrase2(?P#2))|...

- apply match repeatedly to the input string. for each match, use
  the index attribute to figure out what phrase we matched.

see below for a slightly larger example.

</F>

import sre

# Tokenizer built from a lexicon of (phrase, action) pairs. All phrases are
# joined into one alternation; each alternative ends in the proposed (?P#n)
# marker so that a match can report which phrase was hit via m.index.
class Scanner:
    # lexicon: sequence of (phrase, action) pairs. `phrase` is a regular
    # expression fragment; `action` is either a callable taking
    # (scanner, token) or a non-callable value (see scan()).
    def __init__(self, lexicon):
        self.lexicon = lexicon
        p = []
        for phrase, action in lexicon:
            # len(p) is the position of this phrase in the lexicon (starting
            # at 0); scan() uses the same number to index self.lexicon.
            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
        self.scanner = sre.compile("|".join(p))
    # Scan `string` from the start, one match at a time.
    # Returns a tuple (result, tail): `result` is the list of values produced
    # by the actions, `tail` is the part of `string` that was not consumed
    # (empty when the whole input was scanned).
    def scan(self, string):
        result = []
        # bind the methods used inside the loop to local names
        append = result.append
        match = self.scanner.match
        i = 0
        while 1:
            m = match(string, i)
            if not m:
                # nothing in the lexicon matches at position i
                break
            j = m.end()
            if i == j:
                # empty match: no progress would be made, so stop here
                break
            # m.index is the number set by the (?P#n) marker of the phrase
            # that matched; element [1] of the lexicon entry is its action
            action = self.lexicon[m.index][1]
            if callable(action):
                # NOTE(review): this stores the pattern's bound match method,
                # not the match object `m` -- confirm which one was intended
                self.match = match
                action = action(self, m.group())
            # a None action (or a callable returning None) emits nothing;
            # a non-callable, non-None action is appended as-is
            if action is not None:
                append(action)
            i = j
        return result, string[i:]

# Action callbacks: each receives the scanner and the matched text and
# returns the value to append to the token list.
def s_ident(scanner, token): return token
def s_operator(scanner, token): return "operator%s" % token
def s_float(scanner, token): return float(token)
def s_int(scanner, token): return int(token)

# Example lexicon. The float pattern is listed before the int pattern;
# whitespace has a None action, so it is consumed without producing a token.
scanner = Scanner([
    (r"[a-zA-Z_]\w*", s_ident),
    (r"\d+\.\d*", s_float),
    (r"\d+", s_int),
    (r"=|\+|-|\*|/", s_operator),
    (r"\s+", None),
    ])

tokens, tail = scanner.scan("sum = 3*foo + 312.50 + bar")

print tokens
# a non-empty tail means scanning stopped before the end of the input
if tail:
    print "syntax error at", tail

## prints:
## ['sum', 'operator=', 3, 'operator*', 'foo', 'operator+',
##  312.5, 'operator+', 'bar']