data file auto-parsing
Manuel M Garcia
mail at manuelmgarcia.com
Sun Mar 9 21:04:15 EST 2003
On Wed, 5 Mar 2003 23:21:53 -0800, "Hilbert" <hilbert at microsoft.com>
wrote:
>Hello,
>
>Is there a parser that would convert python-like
>indented data files into data structures? Like:
>
>#--------------
>numbers
>    1
>    4
>    5
>colors
>    light
>        red
>        green
>    dark
>        blue
>#--------------
>
>Would be parsed into a list like:
>
>[ ['numbers',['1','4','5']],
> ['colors',['light',['red','green']],
> ['dark',['blue']] ] ]
>
>Am I crazy?
>Thanks,
>Hilbert
>
>
I once did something like this for work. Here it is (hopefully
cleaned up a bit)...
Manuel
# ###################################### #
# datafile_parsing00.py
import shlex
import StringIO
import re
import sys
import pprint
import types
import traceback
# ###################################### #
class IndentStream:
    """Write-only stream wrapper that indents the start of every line.

    indent -- either the literal prefix string, or a number of spaces.
    stream -- object with a write() method that receives the indented
              text; defaults to sys.stdout (looked up when the
              instance is created).

    Only write() is provided, which is enough for the instance to be
    used as the target of 'print >> stream' or as a PrettyPrinter
    stream.
    """

    def __init__(self, indent=' ', stream=None):
        if stream is None:
            self._stream = sys.stdout
        else:
            self._stream = stream
        try:
            # A number means "this many spaces".  Multiplying a string
            # by a string raises TypeError, in which case 'indent' is
            # already the prefix text and is used unchanged.
            indent = ' ' * indent
        except TypeError:
            pass
        self._indent = indent
        self._nl_indent = '\n' + indent
        self._indent_size = len(indent)
        # True while the previous write() ended in the middle of a
        # line, so the next write() must not be prefixed again.
        self._comma = 0

    def write(self, s):
        """Write s to the wrapped stream, indenting each line start."""
        comma = not s.endswith('\n')
        s = s.replace('\n', self._nl_indent)
        if not self._comma:
            s = self._indent + s
        # The replace() above also put an indent after a trailing
        # newline; drop it so the *next* write supplies the prefix.
        # The size check matters: with an empty indent, s[:-0] would be
        # s[:0] and throw the whole string away.
        if self._indent_size and s.endswith(self._nl_indent):
            s = s[:-self._indent_size]
        self._stream.write(s)
        self._comma = comma
def PrintEvalExec(s, stream=None):
"""Print and eval or exec lines in s.
Each line is considered individually,
so not suitable for indented code.
Uses sys._getframe(1) so assignments
will be in correct namespace.
Tries to keep output lines less than 75 characters long.
"""
if stream is None:
stream = sys.stdout
_indentstream = IndentStream(4, stream=stream)
_pprint = pprint.PrettyPrinter(width=71,
stream=_indentstream).pprint
f = sys._getframe(1)
for e in s.split('\n'):
e = e.lstrip()
if e:
try:
r = eval(e, f.f_globals, f.f_locals)
except SyntaxError:
exec e in f.f_globals, f.f_locals
print >> stream, e
else:
if isinstance(r, types.StringTypes) and r.count('\n'):
print >> stream, '%s -> <multi-line string...>' %
(e,)
print >> _indentstream, r
else:
x = '%s -> %r' % (e, r)
if len(x) > 75:
print >> stream, '%s ->' % (e,)
_pprint(r)
else:
print >> stream, x
# ###################################### #
# Text substituted for each tab character before a line's indent is
# measured (see _line_to_indent_tokenlist).
_TABSPACES = ' ' * 4
# ###################################### #
# Splits a line into group(1) = leading whitespace and
# group(2) = the rest of the line.
_re_indent_rest = re.compile(r'(\s*)(.*)')
class shlexError(Exception):
    """Signals that shlex failed while a line was being tokenized."""
def _line_to_indent_tokenlist(L):
    """Split one line into its indent width and a tuple of tokens.

    Tabs are replaced by _TABSPACES before the leading whitespace is
    measured.  The rest of the line is tokenized with shlex; reading
    stops at end of input or at the first token starting with '#', so
    trailing # comments are dropped.

    Returns (indent, tokens): indent is the number of leading
    whitespace characters, tokens is a (possibly empty) tuple of
    strings.

    Raises shlexError if shlex fails while reading a token.  The
    original traceback is printed first, because the replacement
    exception only carries a summary of it.
    """
    L = L.replace('\t', _TABSPACES)
    re0 = _re_indent_rest.match(L)
    indent = len(re0.group(1))
    shlex0 = shlex.shlex(
        StringIO.StringIO(re0.group(2)))
    tokens = []
    while 1:
        try:
            t = shlex0.get_token()
        except (KeyboardInterrupt, SystemExit):
            # Never disguise an interrupt or exit request as a
            # tokenizer failure.
            raise
        except Exception:
            traceback.print_exc()
            # Innermost traceback entry: where shlex actually failed.
            err_last = traceback.extract_tb(sys.exc_info()[2])[-1]
            raise shlexError(
                'shlex gagged during get_token()',
                'shlex.token:%r shlex.lineno:%r' % (shlex0.token,
                                                    shlex0.lineno),
                err_last)
        # '' is what get_token() hands back once the input is used up.
        if (t == '') or t.startswith('#'):
            break
        tokens.append(t)
    return indent, tuple(tokens)
def _simplify_list(list0):
    """Unwrap, in place, every element of list0 that holds one item.

    An element of length 1 is replaced by its only item; any other
    element is kept as it is.  Every element is expected to be a
    non-empty list.  list0 itself is modified; nothing is returned.
    """
    def unwrap(item):
        if len(item) == 1:
            return item[0]
        return item

    # Slice assignment keeps the caller's list object and only swaps
    # its contents, once all replacements have been computed.
    list0[:] = [unwrap(item) for item in list0]
def ProcessInput(s):
    """Turn indented lines of text into nested lists.

    s is split on newlines.  Each line is tokenized by
    _line_to_indent_tokenlist; lines without tokens (blank lines,
    comment-only lines) are skipped.  A line becomes a child of the
    nearest preceding line that has a strictly smaller indent.

    In the returned list a line without children appears as its token
    (a string), or as a tuple of strings when it has several tokens.
    A line with children appears as [token_or_tuple, [children...]].
    """
    processed = []    # top-level entries; rough form of the result
    parent_list = []  # stack of lines that may still receive children
    all_p_lines = []  # every p_line, kept for the clean-up pass below
    for L in s.split('\n'):
        indent, tokens = _line_to_indent_tokenlist(L)
        if not tokens:
            continue
        # A single token does not need to stay wrapped in a tuple.
        if len(tokens) == 1:
            tokens = tokens[0]
        p_line = [indent, tokens, []]
        # Discard candidate parents that are indented at least as far
        # as the current line; they cannot be its parent.
        while parent_list and parent_list[-1][0] >= indent:
            del parent_list[-1]
        if parent_list:
            # The nearest remaining candidate is the parent.  Appending
            # to its child list also updates 'processed', because the
            # very same list objects are reachable from both.
            parent_list[-1][2].append(p_line)
        else:
            # No possible parent: this is a top-level entry.
            processed.append(p_line)
        # The current line may in turn be a parent of later lines.
        parent_list.append(p_line)
        all_p_lines.append(p_line)
    # Clean-up.  Editing the p_lines through 'all_p_lines' cleans
    # 'processed' as well (shared objects again).  Going backwards
    # means children are handled before their parents.
    all_p_lines.reverse()
    for x in all_p_lines:
        del x[0]  # the indent is no longer needed
        if x[1]:
            _simplify_list(x[1])
        else:
            # No children: drop the empty child list.
            del x[1]
    _simplify_list(processed)
    return processed
# ###################################### #
def _printevalexec(s):
    """Demo helper: run PrintEvalExec on s with 4-space indented output.

    The IndentStream is created without a target stream, so it writes
    to sys.stdout.  PrintEvalExec looks names up through
    sys._getframe(1), which is this function's frame, so the lines in
    s see this function's locals and the module's globals.
    """
    PrintEvalExec(s, stream=IndentStream(4))
_test_input = """#--------------
numbers
1
4
5
colors
light
red
green
dark
blue
#--------------"""
print '\n*** Example 1 ***\n'
_printevalexec('_test_input')
_printevalexec('ProcessInput(_test_input)')
_test_input = """#--------------
junk0 # this comment should be ignored
junk1 junk2 junk3 'string #1' # as should this comment
0
1
'1.0'
'1.1'
'1.2'
2 2A 2B
3
#--------------"""
print '\n*** Example 2 ***\n'
_printevalexec('_test_input')
_printevalexec('ProcessInput(_test_input)')
------------------------------------------------
*** Example 1 ***
_test_input -> <multi-line string...>
#--------------
numbers
1
4
5
colors
light
red
green
dark
blue
#--------------
ProcessInput(_test_input) ->
[['numbers', ['1', '4', '5']],
['colors', [['light', ['red', 'green']], ['dark',
['blue']]]]]
*** Example 2 ***
_test_input -> <multi-line string...>
#--------------
junk0 # this comment should be ignored
junk1 junk2 junk3 'string #1' # as should this comment
0
1
'1.0'
'1.1'
'1.2'
2 2A 2B
3
#--------------
ProcessInput(_test_input) ->
['junk0',
[('junk1', 'junk2', 'junk3', "'string #1'"),
['0', ['1', ["'1.0'", "'1.1'", "'1.2'"]], ('2', '2A', '2B'),
'3']]]
More information about the Python-list
mailing list