A nice way to use regex for complicated parsing
Shane Geiger
sgeiger at ncee.net
Thu Mar 29 10:42:50 EDT 2007
It would be worth learning pyparsing to do this.
aspineux wrote:
> My goal is to write a parser for these imaginary strings from the SMTP
> protocol, following RFC 821 and RFC 1869.
> I'm a little flexible with the BNF from these RFCs :-)
> Any comments?
>
> tests=[ 'MAIL FROM:<john.smith at address.com>',
> 'MAIL FROM:john.smith at address.com',
> 'MAIL FROM:<john.smith at address.com> SIZE=1234
> OTHER=foo at bar.com',
> 'MAIL FROM:john.smith at address.com SIZE=1234
> OTHER=foo at bar.com',
> 'MAIL FROM:<"this at is.a> legal=email"@address.com>',
> 'MAIL FROM:"this at is.a> legal=email"@address.com',
> 'MAIL FROM:<"this at is.a> legal=email"@address.com> SIZE=1234
> OTHER=foo at bar.com',
> 'MAIL FROM:"this at is.a> legal=email"@address.com SIZE=1234
> OTHER=foo at bar.com',
> ]
>
> def RN(name, regex):
> """protect using () and give an optional name to a regex"""
> if name:
> return r'(?P<%s>%s)' % (name, regex)
> else:
> return r'(?:%s)' % regex
>
>
> regex={}
>
> # <dotnum> ::= <snum> "." <snum> "." <snum> "." <snum>
> regex['dotnum']=RN(None, r'[012]?\d?\d\.[012]?\d?\d\.[012]?\d?\d\.
> [012]?\d?\d' % regex)
> # <dot-string> ::= <string> | <string> "." <dot-string>
> regex['dot_string']=RN(None, r'[a-zA-Z0-9]+(?:\.[a-zA-Z0-9]+)*' %
> regex)
> # <domain> ::= <element> | <element> "." <domain>
> regex['domain']=RN('domain', r'%(dotnum)s|%(dot_string)s' % regex)
> # <q> ::= any one of the 128 ASCII characters except <CR>, <LF>, quote
> # ("), or backslash (\)
> regex['q']=RN(None, r'[^\n\r"\\]' % regex)
> # <x> ::= any one of the 128 ASCII characters (no exceptions)
> regex['x']=RN(None, r'.' % regex)
> # <qtext> ::= "\" <x> | "\" <x> <qtext> | <q> | <q> <qtext>
> regex['qtext']=RN(None, r'(?:\\%(x)s|%(q)s)+' % regex)
> # <quoted-string> ::= """ <qtext> """
> regex['quoted_string']=RN('quoted_string', r'"%(qtext)s"' % regex)
> # <local-part> ::= <dot-string> | <quoted-string>
> regex['local_part']=RN('local_part', r'%(quoted_string)s|%
> (dot_string)s' % regex)
> # <mailbox> ::= <local-part> "@" <domain>
> regex['mailbox']=RN('mailbox', r'%(local_part)s@%(domain)s' % regex)
> # <path> ::= "<" [ <a-d-l> ":" ] <mailbox> ">"
> # also accept address without <>
> regex['path']=RN('path', r'(?P<path_lt><)?%(mailbox)s(?(path_lt)>)' %
> regex)
> # esmtp-keyword ::= (ALPHA / DIGIT) *(ALPHA / DIGIT / "-")
> regex['esmtp_keyword']=RN(None, r'[a-zA-Z0-9][-a-zA-Z0-9]*' % regex)
> # esmtp-value ::= 1*<any CHAR excluding "=", SP, and all
> # control characters (US ASCII 0-31 inclusive)>
> # ; syntax and values depend on esmtp-keyword
> regex['esmtp_value']=RN(None, r'[^= \t\r\n\f\v]*' % regex)
> # esmtp-parameter ::= esmtp-keyword ["=" esmtp-value]
> regex['esmtp_parameter']=RN(None, r'%(esmtp_keyword)s(?:=%
> (esmtp_value)s)?' % regex)
> # esmtp-parameters ::= esmtp-parameter *(SP esmtp-parameter)
> regex['esmtp_parameters']=RN('esmtp_parameters', r'%
> (esmtp_parameter)s(?:\s+%(esmtp_parameter)s)+' % regex)
> # esmtp-cmd ::= inner-esmtp-cmd [SP esmtp-parameters] CR LF
> regex['esmtp_addr']=RN('esmtp_addr', r'%(path)s(?:\s+%
> (esmtp_parameters)s)?' % regex)
>
> for t in tests:
> for keyword in [ 'MAIL FROM:', 'RCPT TO:' ]:
> keylen=len(keyword)
> if t[:keylen].upper()==keyword:
> t=t[keylen:]
> break
>
> match=re.match(regex['esmtp_addr'], t)
> if match:
> print 'MATCH local_part=%(local_part)s domain=%(domain)s
> esmtp_parameters=%(esmtp_parameters)s' % match.groupdict()
> else:
> print 'DONT match', t
>
>
--
Shane Geiger
IT Director
National Council on Economic Education
sgeiger at ncee.net | 402-438-8958 | http://www.ncee.net
Leading the Campaign for Economic and Financial Literacy
-------------- next part --------------
A non-text attachment was scrubbed...
Name: sgeiger.vcf
Type: text/x-vcard
Size: 310 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/python-list/attachments/20070329/4b6f5a97/attachment.vcf>
More information about the Python-list mailing list