looking for speed-up ideas

Mike C. Fletcher mcfletch at rogers.com
Mon Feb 3 20:27:08 EST 2003


This is untested, but it might give you an idea of how to write 
something a little faster:

from __future__ import generators
import bisect

def lines( filename ):
    """Yield (directoryPath, line) for each F line in the listing"""
    paths = {}      # directory number -> tuple of path components
    currentPath = ()
    for line in open( filename ):
        if line[0] == 'F':
            yield currentPath, line
        elif line[0] == 'S':
            # S/<name>/<parent_num>/<this_dir_num>: record this
            # directory's full path so that you can properly report
            # the filename iff a sub-file turns out to be one of the
            # final result set
            fields = line.split( '/' )
            currentPath = paths[int(fields[2])] + (fields[1],)
            paths[int(fields[3])] = currentPath
        elif line[0] == 'T':
            # T <top_path> 0 establishes the root (directory 0)
            currentPath = ( line.split()[1], )
            paths[0] = currentPath

def top200( filename, count=200 ):
    """Find the largest count files in the given filesystem-list"""
    smallest = 0
    results = []
    for directory, line in lines( filename ):
        lineSize = int( line.split('/')[1] )
        if lineSize > smallest:
            bisect.insort( results, (lineSize, line, directory) )
            if len(results) > count:
                # keep only the count largest seen so far
                del results[:-count]
            if len(results) == count:
                # only start short-circuiting once the result set is
                # full, otherwise early small files are wrongly dropped
                smallest = results[0][0]
    return results
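
Calling it would look something like this (untested):

import sys

results = top200( sys.argv[1] )
results.reverse()   # largest first
for size, line, path in results:
    print size, '/'.join( path ) + '/' + line.split('/')[2].strip()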

The ideas being:

    * generators are fast because everything is local

    * don't fully parse lines you don't need (i.e. those where the
      size is insufficient); figure out which lines are important
      first, then process the data structures with the
      meta-information

    * as more files are scanned, the proportion of undesired to
      desired files gets higher, so the insort and deletion run less
      and less often (save for pathological cases where you wind up
      scanning everything in precisely increasing size order).  This
      allows the quick check against the current smallest item to
      short-circuit the entire processing of a file, and that should
      mitigate the overhead of bisect and del.  (A heapq-based
      variant is sketched below.)
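
For comparison, here is a minimal sketch of the same selection using
a min-heap (heapq is new in the Python 2.3 standard library, and has
the same heapify/heapreplace calls as the heapqc module you're
already using).  It assumes the lines() generator above:

import heapq

def top200_heap( filename, count=200 ):
    """Keep a min-heap of the count largest (size, line, path) tuples"""
    heap = []
    for directory, line in lines( filename ):
        size = int( line.split('/')[1] )
        if len(heap) < count:
            heapq.heappush( heap, (size, line, directory) )
        elif size > heap[0][0]:
            # evict the current smallest in O(log count) rather than
            # shifting the whole list the way insort/del do
            heapq.heapreplace( heap, (size, line, directory) )
    return heap

With count as small as 200 the O(log count) versus O(count) difference
per kept line probably won't dominate; the real win in either version
is the short-circuit test that skips the vast majority of lines
entirely.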

Hope that helps,
Mike
   

Ram Bhamidipaty wrote:

>I have some python code that processes a large file. I want to see how
>much faster this code can get. Mind you, I don't _need_ the code to go
>faster - but it sure would be nice if it were faster...
>
>Here is a specification of the input:
> 1. Lines start with T, S or F
> 2. The first line of the file starts with
>    T, all other lines start with S or F.
> 3. F lines look like "F/<number>/string"
> 4. S lines look like "S/string/<number>/<number>"
>
>Here is a sample:
>
>T /remote 0
>S/name/0/1
>S/joe/1/2
>S/bob/1/3
>F/3150900/big_file.tar.gz
>S/testing/3/4
>F/414/.envrc
>F/276/BUILD_FLAGS
>F/36505/make.incl
>F/3861/build_envrc
>
>In case you are curious the file is a dump of a file system. F lines
>specify a file name and file size. S lines specify a directory. The
>numbers on an S line represent a directory number and a directory
>parent number. All the F lines under an S line are files in a
>particular directory.
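
[To make the numbering concrete: in the sample above, big_file.tar.gz
follows "S/bob/1/3", so it sits in directory 3 ("bob", parent
directory 1); directory 1 is "name" with parent 0, the /remote root,
giving the full path /remote/name/bob/big_file.tar.gz.]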
>
>My script reads the file and prints out the 200 largest files.
>
>I am currently using the heapcq module written by John Eikenberry. I
>downloaded it from here: http://zhar.net/projects/python/
>
>There is an edited version of the script at the end of this message.
>
>My script currently processes 300,000 lines in about 18 seconds
>on a Sun Ultra 60. To make performance testing easier the
>script currently limits processing to just reading 300,000 lines.
>The wc program can read the same 300k lines in around 0.4 seconds.
>
>The full input file is around 43 Meg with around 2.2 million lines.
>
>The output from the python profiler shows this:
>
>          15426 function calls in 18.770 CPU seconds
>
>   Ordered by: standard name
>
>   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
>        1    0.010    0.010   18.750   18.750 <string>:1(?)
>        0    0.000             0.000          profile:0(profiler)
>        1    0.020    0.020   18.770   18.770 profile:0(read_for_files(f))
>        1    0.000    0.000   18.740   18.740 read_diskinfo:163(read_for_files)
>     1752    0.060    0.000    0.060    0.000 read_diskinfo:50(__init__)
>    13670    0.560    0.000    0.560    0.000 read_diskinfo:59(__le__)
>        1   18.120   18.120   18.740   18.740 read_diskinfo:89(read_diskinfo_file)
>
>
>I am willing to write C code, but I would first like to get the python
>code faster. 
>
>Any ideas appreciated.
>-Ram
>
>
>------------------------------------------------------------------
>#!/remote/TMhome/ramb/tools/bin/python
>
>import sys, xreadlines, pdb, os.path, imp
>import profile
>
>m = imp.find_module("heapqc", ["/remote/TMhome/ramb/src/Downloaded/py_heap"])
>if not m:
>    print "Unable to load heap module"
>    sys.exit(0)
>mod = imp.load_module("heapqc", m[0], m[1], m[2])
>if not mod:
>    print "module load failed"
>import heapqc
>
>if len(sys.argv) != 3:
>    print "Usage: read_diskinfo files | dirs_self_only | dirs_with_sub <file>"
>    sys.exit(0)
>
>## file format:
>##
>## T full_path_of_top_dir 0
>## F size1 file1
>## F size2 file2
>##   ...
>## S <subname> <par_num> <this_dir_num>
>## F size1 file1
>## F size2 file2
>##   ...
>##
>
>class FileSize:
>    def __init__(self,name,size):
>        self.name = name
>        self.size = size
>    def __lt__(self, other):
>        assert type(self) == type(other)
>        return self.size < other.size
>    def __le__(self, other):
>        return self.size <= other.size
>    def __eq__(self, other):
>        assert type(self) == type(other)
>        return self.size == other.size
>    def __ne__(self, other):
>        assert type(self) == type(other)
>        return self.size != other.size
>    def __ge__(self, other):
>        assert type(self) == type(other)
>        return self.size >= other.size
>    def __gt__(self, other):
>        assert type(self) == type(other)
>        return self.size > other.size
>
>def read_diskinfo_file(f):
>    global dirs
>
>    big_files = []
>
>    print "Starting to read ...",
>    lnum = 0
>    num_in_list = 0
>    sys.stdout.flush()
>    for l in xreadlines.xreadlines(f):
>        lnum += 1
>        #if (lnum & 0xffff) == 0:
>        #    print "read", lnum, "lines."
>        if lnum > 300000:
>            sys.exit(0)
>
>        if l[0] == "F":
>            # F/<size>/<name>
>            tup = l.split("/")
>            try:
>                fsize = int(tup[1])
>            except ValueError:
>                fsize = long(tup[1])
>            
>            if num_in_list < 200:
>                big_files.append(FileSize((this_dir_num, tup[2]), fsize))
>                num_in_list += 1
>                if num_in_list == 200:
>                    heapqc.heapify(big_files)
>            else:
>                if fsize < big_files[0].size:
>                    continue
>                heapqc.heapreplace(big_files, \
>                                   FileSize((this_dir_num, tup[2]), fsize))
>
>        elif l[0] == "S":
>            # S <name> <par_num> <this_dir_num>
>            tup = l.split("/")
>
>            name         = tup[1]
>            par_num      = int(tup[2])
>            this_dir_num = int(tup[3])
>            #print "Working on dir_num=", this_dir_num
>
>            flist = []
>            # a list rather than a tuple, so slot [5] (the size
>            # total) can be updated in place later
>            dirs[this_dir_num] = [name, [], flist, this_dir_num, par_num, 0L]
>
>        elif l[0] == "T":
>            # T full_path 0
>            tup = l.split()
>            flist = []
>            dirs[0] = [tup[1], [], flist, 0, None, 0L]
>
>                
>    print "done reading."
>
>    while big_files:
>        o = heapqc.heappop(big_files)
>        print o.size, o.name
>
>    return
>
>
>def get_dir_name(num):
>    global dirs
>
>    cur_dir_num = num
>    cur_name = ""
>    while cur_dir_num != 0:
>        dir_tup = dirs[cur_dir_num]
>        cur_name = os.path.join(dir_tup[0], cur_name)
>        cur_dir_num = dir_tup[4]
>    return cur_name
>
>def read_for_files(f):
>    global dirs
>    
>    read_diskinfo_file(f)
>    all_file_list = []
>    for (dir_num,dir_info) in dirs.items():
>        file_data = dir_info[2]
>        for tup in file_data:
>            fname = tup[0]
>            fsize = tup[1]
>            encoding = fsize
>            all_file_list.append((encoding, fname, fsize, dir_num))
>
>    all_file_list.sort()
>    all_file_list.reverse()
>    
>
>    #print "Top files sorted by file size"
>    num_to_print = 100
>    num_printed = 0
>    for t in all_file_list:
>        print t[2], t[1], get_dir_name(t[3])
>
>        num_printed += 1
>        if num_printed == num_to_print:
>            break
>
>def read_for_dirs(f, dir_type):
>    global dirs
>    
>    assert dir_type == "dirs_self_only" or dir_type=="dirs_with_sub"
>
>    # read the listing, then for both cases add up the files rooted
>    # in each directory
>    read_diskinfo_file(f)
>    for (dir_num,dir_info) in dirs.items():
>        file_data = dir_info[2]
>        for tup in file_data:
>            fsize = tup[1]
>            dir_info[5] += fsize
>
>    if dir_type == "dirs_with_sub":
>        # additionally roll each directory's own total up into all
>        # of its ancestors
>        own_sizes = {}
>        for (dir_num,dir_info) in dirs.items():
>            own_sizes[dir_num] = dir_info[5]
>        for (dir_num,dir_info) in dirs.items():
>            par_num = dir_info[4]
>            while par_num is not None:
>                dirs[par_num][5] += own_sizes[dir_num]
>                par_num = dirs[par_num][4]
>        
>
>##
>## Generate info about how disk space is being used
>## there are two main kinds of info 1. list of the
>## largest files and 2. the directories that contain
>## the largest files
>##
>
>fname = sys.argv[2]
>f = file(fname, "r")
>
>##
>## dirs hash table
>##   key is the directory number as a string
>##   data is a list:
>##
>##     [0] --> name
>##     [1] --> list_of_subdir_nums
>##     [2] --> list_of_file_data
>##     [3] --> num_as_key
>##     [4] --> par_dir_num
>##     [5] --> size
>##
>##  file data is for files rooted in this directory
>##
>##  each file data tuple:
>##     
>##    [0] --> name
>##    [1] --> file size
>##
>dirs = {}
>
>try:
>    if sys.argv[1] == "files":
>        profile.run("read_for_files(f)")
>    elif sys.argv[1] == "dirs_self_only":
>        read_for_dirs(f, "dirs_self_only")
>    elif sys.argv[1] == "dirs_with_sub":
>        read_for_dirs(f, "dirs_with_sub")
>    else:
>        print "Unknown option:", sys.argv[1]
>        sys.exit(0)
>except AssertionError:
>    exc_typ, exc_val, exc_trace = sys.exc_info()
>    pdb.post_mortem(exc_trace)
>
>sys.exit(0)

-- 
_______________________________________
  Mike C. Fletcher
  Designer, VR Plumber, Coder
  http://members.rogers.com/mcfletch/