parsing an Excel formula with the re module

Paul McGuire ptmcg at austin.rr.com
Wed Jan 13 03:15:52 EST 2010


On Jan 5, 1:49 pm, Tim Chase <python.l... at tim.thechases.com> wrote:
> vsoler wrote:
> > Hence, I need toparseExcel formulas. Can I do it by means only of re
> > (regular expressions)?
>
> > I know that for simple formulas such as "=3*A7+5" it is indeed
> > possible. What about complex for formulas that include functions,
> > sheet names and possibly other *.xls files?
>
> Where things start getting ugly is when you have nested function
> calls, such as
>
>    =if(Sum(A1:A25)>42,Min(B1:B25), if(Sum(C1:C25)>3.14,
> (Min(C1:C25)+3)*18,Max(B1:B25)))
>
> Regular expressions don't do well with nested parens (especially
> arbitrarily-nesting-depth such as are possible), so I'd suggest
> going for a full-blown parsing solution like pyparsing.
>
> If you have fair control over what can be contained in the
> formulas and you know they won't contain nested parens/functions,
> you might be able to formulate some sort of "kinda, sorta, maybe
> parses some forms of formulas" regexp.
>
> -tkc

This might give the OP a running start:

from pyparsing import (CaselessKeyword, Suppress, Word, alphas,
    alphanums, nums, Optional, Group, oneOf, Forward, Regex,
    operatorPrecedence, opAssoc, dblQuotedString)

test1 = "=3*A7+5"
test2 = "=3*Sheet1!$A$7+5"
test3 = "=if(Sum(A1:A25)>42,Min(B1:B25), " \
     "if(Sum(C1:C25)>3.14, (Min(C1:C25)+3)*18,Max(B1:B25)))"

EQ,EXCL,LPAR,RPAR,COLON,COMMA,DOLLAR = map(Suppress, '=!():,$')
sheetRef = Word(alphas, alphanums)
colRef = Optional(DOLLAR) + Word(alphas,max=2)
rowRef = Optional(DOLLAR) + Word(nums)
cellRef = Group(Optional(sheetRef + EXCL)("sheet") + colRef("col") +
                    rowRef("row"))

cellRange = (Group(cellRef("start") + COLON + cellRef("end"))
("range")
                | cellRef )

expr = Forward()

COMPARISON_OP = oneOf("< = > >= <= != <>")
condExpr = expr + COMPARISON_OP + expr

ifFunc = (CaselessKeyword("if") +
          LPAR +
          Group(condExpr)("condition") +
          COMMA + expr("if_true") +
          COMMA + expr("if_false") + RPAR)
statFunc = lambda name : CaselessKeyword(name) + LPAR + cellRange +
RPAR
sumFunc = statFunc("sum")
minFunc = statFunc("min")
maxFunc = statFunc("max")
aveFunc = statFunc("ave")
funcCall = ifFunc | sumFunc | minFunc | maxFunc | aveFunc

multOp = oneOf("* /")
addOp = oneOf("+ -")
numericLiteral = Regex(r"\-?\d+(\.\d+)?")
operand = numericLiteral | funcCall | cellRange | cellRef
arithExpr = operatorPrecedence(operand,
    [
    (multOp, 2, opAssoc.LEFT),
    (addOp, 2, opAssoc.LEFT),
    ])

textOperand = dblQuotedString | cellRef
textExpr = operatorPrecedence(textOperand,
    [
    ('&', 2, opAssoc.LEFT),
    ])
expr << (arithExpr | textExpr)

import pprint
for test in (test1,test2, test3):
    print test
    pprint.pprint( (EQ + expr).parseString(test).asList() )
    print


Prints:

=3*A7+5
[[['3', '*', ['A', '7']], '+', '5']]

=3*Sheet1!$A$7+5
[[['3', '*', ['Sheet1', 'A', '7']], '+', '5']]

=if(Sum(A1:A25)>42,Min(B1:B25), if(Sum(C1:C25)>3.14, (Min(C1:C25)+3)
*18,Max(B1:B25)))
['if',
 ['sum', [['A', '1'], ['A', '25']], '>', '42'],
 'min',
 [['B', '1'], ['B', '25']],
 'if',
 ['sum', [['C', '1'], ['C', '25']], '>', '3.14'],
 [['min', [['C', '1'], ['C', '25']], '+', '3'], '*', '18'],
 'max',
 [['B', '1'], ['B', '25']]]


-- Paul



More information about the Python-list mailing list