looking for speed-up ideas

Ram Bhamidipaty ramb at sonic.net
Tue Feb 4 16:06:11 EST 2003


Andrew Dalke <adalke at mindspring.com> writes:

> Ram Bhamidipaty wrote:
> > Ok. Here are two scripts:
> 
> Indeed.  I'm still surprised my code is that much slower.
> 
> Here are my thoughts:
>    - you use xreadlines.xreadlines(f) where I use "for line in f"
>        I thought they were the same speed, but perhaps I'm wrong.
>        Could you change that and find out?

With xreadlines the new speed is:
espring> ~/script2.1.py /tmp/foo /tmp/foo_2
         3 function calls in 32.920 CPU seconds
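
A rough micro-benchmark to compare the two iteration styles in isolation
might look like this (/tmp/foo is the same test file; the numbers would
only be indicative, since no profiler is involved):

----------------------------------------------------------------
import time, xreadlines

def time_loop(make_iter, path):
    f = open(path)
    start = time.clock()
    count = 0
    for line in make_iter(f):
        count += 1
    f.close()
    return time.clock() - start

# plain "for line in f" iteration vs. xreadlines over the same file
print "for line in f: %.2fs" % time_loop(iter, "/tmp/foo")
print "xreadlines:    %.2fs" % time_loop(xreadlines.xreadlines, "/tmp/foo")
----------------------------------------------------------------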




> 
>    - You do
>        try / int() / except ValueError / long()
>       when I always do long().  It could be that longs are much more
>       expensive than ints.  Also, in newer Pythons int converts
>       to a long as needed, so the fallback is no longer necessary.

With both xreadlines and the try/except for int/long:

espring> ~/script2.1.py /tmp/foo /tmp/foo_2
         3 function calls in 29.410 CPU seconds
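
A quick interactive check of Andrew's point about promotion, on a recent
interpreter:

>>> int("99999999999999999999")
99999999999999999999L

so on newer Pythons the try/except fallback is only a speed trick, not
needed for correctness.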

> 
>    - you do an explicit compare against the max size before insertion
>       into the list while I always append.  That can be a big slowdown,
>       so change
> 
>       for line in infile:
>           if line[:1] == "F":
>               ignore, size, name = line.split("/")
>               # negate size so 'largest' is sorted first
>               fileinfo.append( (-long(size), dirid, name) )
>               if len(fileinfo) > 10000:
>                   # Could use a heapq....
>                   fileinfo.sort()
>                   fileinfo = fileinfo[:200]
>       into
> 
>       min_allowed = 0
> 
>       for line in infile:
>           if line[:1] == "F":
>               ignore, size, name = line.split("/")
>               # negate size so 'largest' is sorted first
>               size = long(size)
>               if size > min_allowed:
>               fileinfo.append( (-size, dirid, name) )
>               if len(fileinfo) > 10000:
>                   # Could use a heapq....
>                   fileinfo.sort()
>                   del fileinfo[200:]
>                   min_allowed = -fileinfo[-1][0]
> 
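
For reference, that "Could use a heapq" comment could be fleshed out
roughly like this, keeping a min-heap of the n largest (size, name) pairs.
This sketch uses the pure-Python heapq module that is going into 2.3; I
use the compiled heapqc variant in script1 below, and the directory-id
bookkeeping is elided here:

----------------------------------------------------------------
import heapq

def top_n_sizes(infile, n=200):
    heap = []   # min-heap: heap[0] is the smallest entry kept so far
    for line in infile:
        if line[:1] != "F":
            continue   # directory bookkeeping elided; see the full scripts
        ignore, size, name = line.split("/")
        size = long(size)
        if len(heap) < n:
            heapq.heappush(heap, (size, name))
        elif size > heap[0][0]:
            # bigger than the smallest of the current top n: swap it in
            heapq.heapreplace(heap, (size, name))
    heap.sort()   # smallest-to-largest, for output
    return heap
----------------------------------------------------------------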

And now with all three:

espring> ~/script2.1.py /tmp/foo /tmp/foo_2
         3 function calls in 17.610 CPU seconds

Here is the version that generated the above result:

----------------------------------------------------------------
#!/remote/espring/ramb/tools/bin/python

import sys, profile, xreadlines

def process(infile,outfile):
     dirid_info = {}

     line = infile.readline()
     assert line[:1] == "T"
     ignore, dirname, dirid = line.split()
     dirid_info[dirid] = (None, dirname)

     fileinfo = []
     min_allowed = 0

     for line in xreadlines.xreadlines(infile):
         if line[:1] == "F":
             ignore, size, name = line.split("/")
             
             try:
                 size2 = int(size)
             except ValueError:
                 size2 = long(size)

             if size2 > min_allowed: 
                 fileinfo.append( (-size2, dirid, name) ) 
             if len(fileinfo) > 10000: 
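                 # prune: keep only the 200 largest seen so far and raise
                 # the cutoff, so future small files are skipped up front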
                 fileinfo.sort() 
                 del fileinfo[200:] 
                 min_allowed = -fileinfo[-1][0] 

         else:
             ignore, dirname, parent_id, dirid = line[:-1].split("/")
             dirid_info[dirid] = (parent_id, dirname)

     fileinfo.sort()
     fileinfo = fileinfo[:200]

     for size, dirid, name in fileinfo:
         size = -size
         components = [name[:-1]]  # need to chop newline
         while dirid is not None:
             dirid, dirname = dirid_info[dirid]
             components.append(dirname)
         components.reverse()
         print >>outfile, size, "/".join(components)

f = file(sys.argv[1], "r")
f2 = file(sys.argv[2], "w")
profile.run("process(f, f2)")
----------------------------------------------------------------
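
The profiler adds noticeable overhead, so the raw runtime should be
somewhat better than the figures above.  A minimal way to check would be
to replace the profile.run() line with a plain timer:

    import time
    start = time.clock()
    process(f, f2)
    print "%.3f CPU seconds" % (time.clock() - start)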

And for reference I've included my script1.py with the change suggested
by Tim Peters:
----------------------------------------------------------------
#!/remote/TMhome/ramb/tools/bin/python

import sys, xreadlines, pdb, os.path, imp
import profile

m = imp.find_module("heapqc", ["/remote/TMhome/ramb/src/Downloaded/py_heap"])
imp.load_module("heapqc", m[0], m[1], m[2])
import heapqc

class FileSize:
    def __init__(self,name,size):
        self.name = name
        self.size = size
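    # heap entries are ordered by size alone; <= is all the heap code needs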
    def __le__(self,other):
        return self.size <= other.size

def read_diskinfo_file(f,fout):
    global dirs

    big_files = []

    num_in_list = 0
    sys.stdout.flush()
    for l in xreadlines.xreadlines(f):
        if l[0] == "F":
            tup = l.split("/")
            try:
                fsize = int(tup[1])
            except ValueError:
                fsize = long(tup[1])
            
            if num_in_list < 200:
                big_files.append(FileSize((this_dir_num, tup[2]), fsize))
                num_in_list += 1
                if num_in_list == 200:
                    heapqc.heapify(big_files)
            else:
                if fsize < big_files[0].size:
                    continue
                heapqc.heapreplace(big_files,
                                   FileSize((this_dir_num, tup[2]), fsize))

        elif l[0] == "S":
            tup = l.split("/")

            name         = tup[1]
            par_num      = int(tup[2])
            this_dir_num = int(tup[3])

            flist = []
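            # tuple layout: (name, subdirs, files, own id, parent id, size);
            # get_dir_name() below only uses the name and parent-id fields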
            dirs[this_dir_num] = (name, [], flist, this_dir_num, par_num, 0L)

        elif l[0] == "T":
            tup = l.split()
            flist = []
            this_dir_num = 0   # files listed under the root belong to dir 0
            dirs[0] = (tup[1], [], flist, 0, None, 0L)

                
    final_order = []
    heapqc.heapify(big_files)
    while big_files:
        o = heapqc.heappop(big_files)
        final_order.append(o)

    final_order.reverse()
    for o in final_order:
        name = get_dir_name(o.name[0]) + o.name[1]
        print >>fout, o.size, name,

    return

def get_dir_name(num):
    global dirs

    cur_dir_num = num
    cur_name = ""
    while cur_dir_num != 0:
        dir_tup = dirs[cur_dir_num]
        cur_name = os.path.join(dir_tup[0], cur_name)
        cur_dir_num = dir_tup[4]
    cur_name = os.path.join(dirs[0][0], cur_name)
    return cur_name


dirs = {}
in_fname = sys.argv[1]
f_in = file(in_fname, "r")
out_fname = sys.argv[2]
f_out = file(out_fname, "w")
profile.run("read_diskinfo_file(f_in, f_out)")
f_in.close()
f_out.close()
----------------------------------------------------------------
The times for the new script:

espring> ~/script1.py /tmp/foo /tmp/foo_1
         18760 function calls in 17.850 CPU seconds


Wow - and now your script is a bit faster than mine. The big win came
from not appending anything that was known to be too small.


-----------------------------------------------------------------
The sad part is this:

espring> cat du_sh_script
grep \^F /tmp/foo | sort -t '/' -n -k 2,2 | tail -200 > /tmp/foo_sh

espring> time ~/du_sh_script
8.40u 0.58s 0:09.09 98.7%

To be fair, the shell script does not output the full file name, but
that could be handled with some post-processing.



Ok - so now I have a script that is about half the speed of grep + sort.
I guess that's not bad for an interpreted language.

-Ram



