High memory usage - program mistake or Python feature?

Bengt Richter bokr at oz.net
Sun May 25 05:20:40 EDT 2003


On Sat, 24 May 2003 20:34:31 +0100, "Ben S" <bens at replytothegroupplease.com> wrote:

>Hmm, in quick experiments with using xreadlines instead of readlines,
>there is obviously the problem that while a single iteration over either
>container works the same way, in order to repeat iterations over
>xreadlines I need to somehow reset the iterator, which a quick look at
>the documentation doesn't show me how to do. How do I do this, so that
>my functions can take a list of lines without caring whether those lines
>are in memory or coming from xreadlines?
>

Here's a possibility for conserving space while allowing random access to
optionally stripped lines (warning: not tested beyond what you see ;-).

For a windows file with \r\n EOLs in the raw file, putting it in memory
should cost a net 2 extra bytes per rstripped line (4 for the int position,
minus the 2 for the stripped \r\n).

BTW, using int positions, this is limited to 2**31-1 file size (2,147,483,647),
assuming you had the memory to slurp that (or you could use a mmap image otherwise
instead of self.data and maybe another for self.linePos as well at the cost of
some en/decoding).

Hm, that might make an interesting variant of array -- an option to allocate in  mmap space(s).

====< linesof.py >===============================================
class LinesOf(object):
    """Compact, randomly accessible store of a file's (optionally stripped) lines.

    The text of every line is packed end to end into one character array
    (self.data) and a second array (self.linePos) records the offset at
    which each line starts, plus one final entry for the end of the data.
    A line is recovered by slicing self.data between two neighbouring
    offsets.

    Positional arguments are passed unchanged to open() (file path, then
    optional mode, ...).  The one keyword looked at is strip_method: a
    callable applied to each line before it is stored, e.g. str.rstrip.
    When it is missing or false the lines are stored exactly as read.

    Supported operations: len(), integer indexing (negative indices count
    from the end) and iteration through the indexing protocol.  Slices
    raise NotImplementedError.
    """
    from array import array  # class attribute on purpose: used below as self.array

    def __init__(self, *fileargs, **kw):
        strip_method = kw.get('strip_method')
        self.data = self.array('c')     # all stored lines, concatenated
        self.linePos = self.array('l')  # start offset of each line; max 2**31-1
        pos = 0
        infile = open(*fileargs)
        try:
            for line in infile:
                if strip_method: line = strip_method(line)
                self.data.fromstring(line)
                self.linePos.append(pos)
                pos += len(line) # pos of next line
        finally:
            # Release the handle deterministically, even if strip_method raises.
            infile.close()
        self.num_lines = len(self.linePos)
        self.linePos.append(pos) # pos of EOF

    def __getitem__(self, i):
        """Return line i as a string; raise IndexError when i is out of range."""
        if isinstance(i, int):
            if i < 0: i += self.num_lines
            # Without this check an index that is still negative after the
            # adjustment would wrap around inside linePos and silently
            # return '' or some unrelated line.
            if not 0 <= i < self.num_lines:
                raise IndexError('line index out of range')
            return self.data[self.linePos[i]:self.linePos[i+1]].tostring()
        raise NotImplementedError('slices not yet implemented')

    def __len__(self): return self.num_lines

def test(*fileargs, **kwargs):
    """Print a small demonstration of LinesOf for the given file.

    All arguments are handed on to LinesOf unchanged; fileargs[0] is also
    used as the label in the printed output.  Every line is listed with
    its index (shown as repr), followed by the line count, the first two
    and last three lines, and finally a slice lookup of [1:3].
    """
    lines = LinesOf(*fileargs, **kwargs)
    index = 0
    for text in lines:
        print('%3s: %s' % (index, repr(text)))
        index += 1
    label = fileargs[0]
    print('len(%s) => %s' % (label, len(lines)))
    print('%s[0]: %s' % (label, lines[0]))
    print('%s[1]: %s' % (label, lines[1]))
    print('...')
    print('%s[-3]: %s' % (label, lines[-3]))
    print('%s[-2]: %s' % (label, lines[-2]))
    print('%s[-1]: %s' % (label, lines[-1]))
    print('%s[1:3] %s\n' % (label, lines[1:3]))

if __name__ == '__main__':
    import sys

    def getit(names):
        """Resolve a dotted name such as 'str.rstrip' to the object it names.

        The first component is looked up in this module's globals and then
        in the builtins; every following component is fetched with
        getattr.  Raises NameError when the first component is unknown and
        AttributeError when a later one does not exist.
        """
        parts = names.split('.')
        root = parts[0]
        # Membership tests instead of an "or" chain: a falsy object still
        # resolves, and an unknown name is reported rather than quietly
        # becoming None (which would just switch the option off).
        if root in globals():
            thing = globals()[root]
        elif root in __builtins__.__dict__:
            thing = __builtins__.__dict__[root]
        else:
            raise NameError('name %r is not defined' % (root,))
        for name in parts[1:]:
            thing = getattr(thing, name)
        return thing

    try:
        # Arguments without '=' are passed positionally (file path, mode);
        # key=value arguments become keywords whose value is a dotted name.
        args = tuple([x for x in sys.argv[1:] if '=' not in x])
        kw = {}
        for x in sys.argv[1:]:
            if '=' in x:
                # Split on the first '=' only so the pair always unpacks.
                k, v = x.split('=', 1)
                kw[k] = getit(v)
        test(*args, **kw)
    except Exception:
        # Any failure, including the unimplemented slice at the end of
        # test(), is reported and followed by the usage line.
        e = sys.exc_info()[1]
        print('%s: %s\n' % (e.__class__.__name__, e))
        print('Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]')
        
=================================================================

running it with itself as a test file (I included a slice in the test, but didn't implement
slicing, so it raises the NotImplementedError exception).

[ 1:33] C:\pywk\ut>linesof.py linesof.py  strip_method=str.rstrip
  0: 'class LinesOf(object):'
  1: '    from array import array'
  2: '    def __init__(self, *fileargs, **kw):'
  3: "        strip_method = kw.get('strip_method')"
  4: "        self.data = self.array('c')"

[... etc. (note: the list printed here is of repr(line), not line) ...]

 50: '    except Exception, e:'
 51: "        print '%s: %s\\n' % (e.__class__.__name__, e)"
 52: "        print 'Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]'"
 53: ''
 54: ''
len(linesof.py) => 55
linesof.py[0]: class LinesOf(object):
linesof.py[1]:     from array import array
...
linesof.py[-3]:         print 'Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]'
linesof.py[-2]:
linesof.py[-1]:
NotImplementedError: slices not yet implemented

Usage: [python] linesof.py filepath [mode] [strip_method=str.[rstrip|strip|lstrip]]

--
Or, the way you might use it as a module:

 >>> import linesof
 >>> lines = linesof.LinesOf('linesof.py', strip_method=str.rstrip)
 >>> lines[0]
 'class LinesOf(object):'
 >>> lines[1]
 '    from array import array'
 >>> lines[-4]
 "        print '%s: %s\\n' % (e.__class__.__name__, e)"

Or strip both sides:

 >>> lines = linesof.LinesOf('linesof.py', strip_method=str.strip)
 >>> lines[0]
 'class LinesOf(object):'
 >>> lines[1]
 'from array import array'
 >>> lines[-4]
 "print '%s: %s\\n' % (e.__class__.__name__, e)"

Or neither side:

 >>> lines = linesof.LinesOf('linesof.py')
 >>> lines[0]
 'class LinesOf(object):\n'
 >>> lines[1]
 '    from array import array\n'
 >>> lines[-4]
 "        print '%s: %s\\n' % (e.__class__.__name__, e)\n"

Obviously you could pass anything that would transform a string as strip_method, e.g.,

 >>> def weird_method(s): return s.lstrip()[:5].upper()
 ...
 >>> lines = linesof.LinesOf('linesof.py', strip_method=weird_method)
 >>> lines[0]
 'CLASS'
 >>> lines[1]
 'FROM '
 >>> lines[-4]
 'PRINT'

Or you could create optional keyword parameters filter_in=<regex including line if match>
and filter_out=<regex to exclude if match> and filter_order=<specify in-out or out-in>,
and allow a whole list of munging methods, etc. etc. to select from the original file and
preprocess it. You could also have a lazyslice(lo,hi) method that would return an iterator
to produce a slice of lines one at a time without making a separate list, though of course
it's easy to iterate through a range of indices and access lines[i] (still weirdly stripped here):

 >>> [lines[i] for i in (0,1,-4)]
 ['CLASS', 'FROM ', 'PRINT']
 >>> [lines[i] for i in range(4)]
 ['CLASS', 'FROM ', 'DEF _', 'STRIP']
 >>> '<%s>' % '><'.join([lines[i] for i in range(4)])
 '<CLASS><FROM ><DEF _><STRIP>'

Etc. Etc.

Regards,
Bengt Richter




More information about the Python-list mailing list