[Python-Dev] A standard lexer?
Fredrik Lundh
"Fredrik Lundh" <effbot@telia.com>
Sun, 2 Jul 2000 18:13:58 +0200
paul wrote:
> As an aside: I would be pumped about getting a generic lexer into the
> Python distribution.
how about this quick and dirty proposal:
- add a new primitive to SRE: (?P#n), where n is a small integer.
this primitive sets the match object's "index" variable to n when
the engine stumbles upon it.
- given a list of "phrases", combine them into a single regular
expression like this:
(?:phrase1(?P#1))|(?:phrase2(?P#2))|...
- apply match repeatedly to the input string. for each match,
use the index attribute to figure out what phrase we matched.
see below for a slightly larger example.
</F>
import sre
class Scanner:
    """Combine a list of (pattern, action) phrases into one regular
    expression and scan strings with it.

    Relies on the proposed SRE primitive ``(?P#n)``, which sets the
    match object's "index" attribute to the integer n when the engine
    reaches it — that is how we tell which alternative matched.
    """

    def __init__(self, lexicon):
        # lexicon: list of (pattern, action) pairs.  action may be
        # None (skip the token), a callable(scanner, token), or a
        # literal value to emit as-is.
        self.lexicon = lexicon
        p = []
        for phrase, action in lexicon:
            # Tag each alternative with its lexicon position so the
            # engine reports which phrase matched via m.index.
            p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
        self.scanner = sre.compile("|".join(p))

    def scan(self, string):
        """Tokenize string.

        Returns (tokens, tail) where tail is the unmatched remainder
        of the input ("" when the whole string was consumed).
        """
        result = []
        append = result.append
        match = self.scanner.match
        i = 0
        while 1:
            m = match(string, i)
            if not m:
                break
            j = m.end()
            if i == j:
                # Zero-width match: stop to avoid an infinite loop.
                break
            action = self.lexicon[m.index][1]
            if callable(action):
                # NOTE(review): this stores the bound match *method*,
                # not the match object m — presumably "self.match = m"
                # was intended; kept as posted.
                self.match = match
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j
        return result, string[i:]
def s_ident(scanner, token):
    """Action: return an identifier token unchanged."""
    return token
def s_operator(scanner, token):
    """Action: tag an operator token with an 'operator' prefix."""
    return "operator%s" % token
def s_float(scanner, token):
    """Action: convert a float literal token to a float."""
    return float(token)
def s_int(scanner, token):
    """Action: convert an integer literal token to an int."""
    return int(token)
# Example lexicon: identifiers, floats (before ints, so "312.50" is
# not split), ints, the four arithmetic operators plus "=", and
# whitespace (action None — silently skipped).
scanner = Scanner([
    (r"[a-zA-Z_]\w*", s_ident),
    (r"\d+\.\d*", s_float),
    (r"\d+", s_int),
    (r"=|\+|-|\*|/", s_operator),
    (r"\s+", None),
])
tokens, tail =3D scanner.scan("sum =3D 3*foo + 312.50 + bar")
print tokens
if tail:
print "syntax error at", tail
## prints:
## ['sum', 'operator=3D', 3, 'operator*', 'foo', 'operator+',
## 312.5, 'operator+', 'bar']