High memory usage - program mistake or Python feature?
Bengt Richter
bokr at oz.net
Sun May 25 05:20:40 EDT 2003
On Sat, 24 May 2003 20:34:31 +0100, "Ben S" <bens at replytothegroupplease.com> wrote:
>Hmm, in quick experiments with using xreadlines instead of readlines,
>there is obviously the problem that while a single iteration over either
>container works the same way, in order to repeat iterations over
>xreadlines I need to somehow reset the iterator, which a quick look at
>the documentation doesn't show me how to do. How do I do this, so that
>my functions can take a list of lines without caring whether those lines
>are in memory or coming from xreadlines?
>
Here's a possibility for conserving space while allowing random access to
optionally stripped lines (warning: not tested beyond what you see ;-).
For a windows file with \r\n EOLs int the raw file, putting it in memory
should cost 2 (4 of int minus the 2 for \r\n stripped) bytes per rstripped line.
BTW, using int positions, this is limited to 2**31-1 file size (2,147,483,647),
assuming you had the memory to slurp that (or you could use a mmap image otherwise
instead of self.data and maybe another for self.linePos as well at the cost of
some en/decoding).
Hm, that might make an interesting variant of array -- an option to allocate in mmap space(s).
====< linesof.py >===============================================
class LinesOf(object):
from array import array
def __init__(self, *fileargs, **kw):
strip_method = kw.get('strip_method')
self.data = self.array('c')
self.linePos = self.array('l') # max 2**31-1
pos = 0
for line in file(*fileargs):
if strip_method: line = strip_method(line)
self.data.fromstring(line)
self.linePos.append(pos)
pos += len(line) # pos of next line
self.num_lines = len(self.linePos)
self.linePos.append(pos) # pos of EOF
def __getitem__(self, i):
if isinstance(i, int):
if i < 0: i += self.num_lines
return self.data[self.linePos[i]:self.linePos[i+1]].tostring()
raise NotImplementedError, 'slices not yet implemented'
def __len__(self): return self.num_lines
def test(*fileargs, **kwargs):
fileLines = LinesOf(*fileargs, **kwargs)
line_no = 0
for something in fileLines:
print '%3s: %s' % (line_no, `something`)
line_no += 1
print 'len(%s) => %s' % (fileargs[0], len(fileLines))
print '%s[0]: %s' % (fileargs[0], fileLines[0])
print '%s[1]: %s' % (fileargs[0], fileLines[1])
print '...'
print '%s[-3]: %s' % (fileargs[0],fileLines[-3])
print '%s[-2]: %s' % (fileargs[0],fileLines[-2])
print '%s[-1]: %s' % (fileargs[0],fileLines[-1])
print '%s[1:3] %s\n' % (fileargs[0], fileLines[1:3])
if __name__ == '__main__':
import sys
def getit(names):
names = names.split('.')
thing, names = names[0], names[1:]
thing = locals().get(thing) or globals().get(thing) or __builtins__.__dict__.get(thing)
if thing:
for name in names: thing = getattr(thing, name)
return thing
try:
args = tuple([x for x in sys.argv[1:] if not '=' in x])
kw = dict([(k, getit(v)) for k,v in [x.split('=') for x in sys.argv[1:] if '=' in x]])
test(*args, **kw)
except Exception, e:
print '%s: %s\n' % (e.__class__.__name__, e)
print 'Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]'
=================================================================
running it with itself as a test file (I included a slice in the test, but didn't implement
slicing, so it raises the NotImplementedError exception).
[ 1:33] C:\pywk\ut>linesof.py linesof.py strip_method=str.rstrip
0: 'class LinesOf(object):'
1: ' from array import array'
2: ' def __init__(self, *fileargs, **kw):'
3: " strip_method = kw.get('strip_method')"
4: " self.data = self.array('c')"
[... etc (note list printed here is of repr(line) not line ...]
50: ' except Exception, e:'
51: " print '%s: %s\\n' % (e.__class__.__name__, e)"
52: " print 'Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]'"
53: ''
54: ''
len(linesof.py) => 55
linesof.py[0]: class LinesOf(object):
linesof.py[1]: from array import array
...
linesof.py[-3]: print 'Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]'
linesof.py[-2]:
linesof.py[-1]:
NotImplementedError: slices not yet implemented
Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]
--
Or, the way you might use it as a module:
>>> import linesof
>>> lines = linesof.LinesOf('linesof.py', strip_method=str.rstrip)
>>> lines[0]
'class LinesOf(object):'
>>> lines[1]
' from array import array'
>>> lines[-4]
" print '%s: %s\\n' % (e.__class__.__name__, e)"
Or strip both sides:
>>> lines = linesof.LinesOf('linesof.py', strip_method=str.strip)
>>> lines[0]
'class LinesOf(object):'
>>> lines[1]
'from array import array'
>>> lines[-4]
"print '%s: %s\\n' % (e.__class__.__name__, e)"
Or neither side:
>>> lines = linesof.LinesOf('linesof.py')
>>> lines[0]
'class LinesOf(object):\n'
>>> lines[1]
' from array import array\n'
>>> lines[-4]
" print '%s: %s\\n' % (e.__class__.__name__, e)\n"
Obviously you could pass anything that would transform a string as strip_method, e.g.,
>>> def weird_method(s): return s.lstrip()[:5].upper()
...
>>> lines = linesof.LinesOf('linesof.py', strip_method=weird_method)
>>> lines[0]
'CLASS'
>>> lines[1]
'FROM '
>>> lines[-4]
'PRINT'
Or you could create optional keyword parameters filter_in=<regex including line if match>
and filter_out=<regex to exclude if match> and filter_order=<specify in-out or out-in>,
and allow a whole list of munging methods, etc. etc. to select from the original file and
preprocess it. You could also have a lazyslice(lo,hi) method that would return an iterator
to produce a slice of lines one at a time without making a separate list, though of course
it's easy to iterate through a range of indices and access lines[i] (still weirdly stripped here):
>>> [lines[i] for i in (0,1,-4)]
['CLASS', 'FROM ', 'PRINT']
>>> [lines[i] for i in range(4)]
['CLASS', 'FROM ', 'DEF _', 'STRIP']
>>> '<%s>' % '><'.join([lines[i] for i in range(4)])
'<CLASS><FROM ><DEF _><STRIP>'
Etc. Etc.
Regards,
Bengt Richter
More information about the Python-list
mailing list