looking for speed-up ideas

Ram Bhamidipaty ramb at sonic.net
Mon Feb 3 19:39:09 EST 2003


I have some python code that processes a large file. I want to see how
much faster this code can get. Mind you, I don't _need_ the code to go
faster - but it sure would be nice if it were faster...

Here is a specification of the input:
 1. Lines start with T, S or F
 2. The first line of the file starts with
    T, all other lines start with S or F.
 3. F lines look like "F/<number>/string"
 4. S lines look like "S/string/<number>/<number>"

Here is a sample:

T /remote 0
S/name/0/1
S/joe/1/2
S/bob/1/3
F/3150900/big_file.tar.gz
S/testing/3/4
F/414/.envrc
F/276/BUILD_FLAGS
F/36505/make.incl
F/3861/build_envrc

In case you are curious the file is a dump of a file system. F lines
specify a file name and file size. S lines specify a directory. The
numbers on an S line represent a directory number and a directory
parent number. All the F lines under an S line are files in a
particular directory.

My script reads the file and prints out the 200 largest files.

I am currently using the heapcq module written by John Eikenberry. I
downloaded it from here: http://zhar.net/projects/python/

There is an edited version of the script at the end of this message.

My script currently processes 300,000 lines in about 18 seconds
on a Sun Ultra 60. To make performance testing easier the
script currently limits processing to just reading 300,000 lines.
The wc program can read the same 300k lines in around 0.4 seconds.

The full input file is around 43 Meg with around 2.2 million lines.

The output from the python profiler shows this:

          15426 function calls in 18.770 CPU seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.010    0.010   18.750   18.750 <string>:1(?)
        0    0.000             0.000          profile:0(profiler)
        1    0.020    0.020   18.770   18.770 profile:0(read_for_files(f))
        1    0.000    0.000   18.740   18.740 read_diskinfo:163(read_for_files)
     1752    0.060    0.000    0.060    0.000 read_diskinfo:50(__init__)
    13670    0.560    0.000    0.560    0.000 read_diskinfo:59(__le__)
        1   18.120   18.120   18.740   18.740 read_diskinfo:89(read_diskinfo_file)


I am willing to write C code, but I would first like to get the python
code faster. 

Any ideas appreciated.
-Ram


------------------------------------------------------------------
#!/remote/TMhome/ramb/tools/bin/python
# Python 2 script: reads a file-system dump and reports the largest files
# and/or per-directory space usage.

import sys, xreadlines, pdb, os.path, imp
import profile

# Locate and load the compiled heap-queue extension ("heapqc") from a
# hard-coded directory, using the low-level imp API (predates importlib).
m = imp.find_module("heapqc", ["/remote/TMhome/ramb/src/Downloaded/py_heap"])
if not m:
    # NOTE(review): imp.find_module raises ImportError on failure rather than
    # returning a false value, so this branch is effectively unreachable.
    print "Unable to load heap module"
    sys.exit(0)
mod = imp.load_module("heapqc", m[0], m[1], m[2])
if not mod:
    print "module load failed"
# Re-import by name so the module is bound in this namespace as `heapqc`.
import heapqc

# Command line: a mode keyword plus the path of the dump file.
if len(sys.argv) != 3:
    print "Usage: read_diskinfo files | dirs_self_only | dirs_with_sub <file>"
    sys.exit(0)

## file format:
##
## T full_path_of_top_dir 0
## F size1 file1
## F size2 file2
##   ...
## S <subname> <par_num> <this_dir_num>
## F size1 file1
## F size2 file2
##   ...
##

class FileSize:
    """A (name, size) record ordered purely by size.

    Element type of the top-200 min-heap in read_diskinfo_file();
    ``name`` is payload (a ``(dir_num, file_name)`` tuple) and never
    affects ordering.  Comparisons return bools, which are
    truthiness-compatible with the 1/0 previously returned.
    """

    def __init__(self, name, size):
        self.name = name    # opaque payload, typically (dir_num, file_name)
        self.size = size    # file size in bytes; the sole sort key

    def __lt__(self, other):
        assert type(self) == type(other)
        return self.size < other.size

    def __le__(self, other):
        # Hot path: the heap routines compare via __le__ (13,670 calls in
        # the profile).  The type assert was missing here, unlike every
        # other comparison method -- restored for consistency.  Run with
        # -O to strip all asserts when profiling.
        assert type(self) == type(other)
        return self.size <= other.size

    def __eq__(self, other):
        assert type(self) == type(other)
        return self.size == other.size

    def __ne__(self, other):
        assert type(self) == type(other)
        return self.size != other.size

    def __ge__(self, other):
        assert type(self) == type(other)
        return self.size >= other.size

    def __gt__(self, other):
        assert type(self) == type(other)
        return self.size > other.size

def read_diskinfo_file(f):
    """Scan dump file *f*, tracking the 200 largest files in a min-heap.

    Populates the global ``dirs`` table from T/S lines and, on normal
    completion, pops and prints the surviving heap entries in ascending
    size order.
    """
    global dirs

    # Min-heap of FileSize records: the smallest of the current top-200
    # sits at index 0 so it can be cheaply displaced by a larger file.
    big_files = []

    print "Starting to read ...",
    lnum = 0
    num_in_list = 0
    sys.stdout.flush()
    for l in xreadlines.xreadlines(f):
        lnum += 1
        #if (lnum & 0xffff) == 0:
        #    print "read", lnum, "lines."
        # Performance-testing cap: stop after 300,000 lines.
        # NOTE(review): sys.exit() aborts the whole process here, so the
        # heap-draining loop at the bottom never runs when the cap is hit.
        if lnum > 300000:
            sys.exit(0)

        if l[0] == "F":
            # F/<size>/<name>
            tup = l.split("/")
            try:
                fsize = int(tup[1])
            except ValueError:
                # Sizes too large for a plain int fall back to long (Py2).
                fsize = long(tup[1])
            
            # NOTE(review): this_dir_num is only bound in the "S" branch, so
            # an F line arriving before the first S line raises NameError.
            if num_in_list < 200:
                # Accumulate the first 200 entries, then heapify once.
                big_files.append(FileSize((this_dir_num, tup[2]), fsize))
                num_in_list += 1
                if num_in_list == 200:
                    heapqc.heapify(big_files)
            else:
                # Skip files smaller than the current 200th-largest (root).
                if fsize < big_files[0].size:
                    continue
                heapqc.heapreplace(big_files, \
                                   FileSize((this_dir_num, tup[2]), fsize))

        elif l[0] == "S":
            # S/<name>/<par_num>/<this_dir_num>
            tup = l.split("/")

            name         = tup[1]
            par_num      = int(tup[2])
            this_dir_num = int(tup[3])
            #print "Working on dir_num=", this_dir_num

            # NOTE(review): flist is created but nothing ever appends file
            # data to it, so every dirs[...][2] list stays empty -- confirm
            # whether F lines were meant to append here.
            flist = []
            dirs[this_dir_num] = (name, [], flist, this_dir_num, par_num, 0L)

        elif l[0] == "T":
            # T full_path 0  (whitespace-separated, unlike the S/F lines)
            tup = l.split()
            flist = []
            dirs[0] = (tup[1], [], flist, 0, None, 0L)

                
    print "done reading."

    # Drain the heap: prints the tracked files in ascending size order.
    while big_files:
        o = heapqc.heappop(big_files)
        print o.size, o.name

    return


def get_dir_name(num):
    """Build the full path for directory *num* by walking parent links
    up to the root (directory 0)."""
    global dirs

    path = ""
    node = num
    while node != 0:
        entry = dirs[node]                    # (name, subdirs, files, num, parent, size)
        path = os.path.join(entry[0], path)   # prepend this component
        node = entry[4]                       # climb to the parent directory
    return path

def read_for_files(f):
    """Report the largest files: scan *f*, then sort every recorded file.

    NOTE(review): read_diskinfo_file() never appends file data to the
    per-directory lists (dirs[...][2]), so all_file_list is always empty
    here and this function prints nothing beyond the scan's own output.
    """
    global dirs
    
    read_diskinfo_file(f)
    all_file_list = []
    for (dir_num,dir_info) in dirs.items():
        file_data = dir_info[2]   # list of (name, size) tuples; see note above
        for tup in file_data:
            fname = tup[0]
            fsize = tup[1]
            encoding = fsize      # sort key: plain file size
            all_file_list.append((encoding, fname, fsize, dir_num))

    # Sort ascending by size, then reverse so the largest come first.
    all_file_list.sort()
    all_file_list.reverse()
    

    #print "Top files sorted by file size"
    num_to_print = 100
    num_printed = 0
    for t in all_file_list:
        print t[2], t[1], get_dir_name(t[3])

        num_printed += 1
        if num_printed == num_to_print:
            break

def read_for_dirs(f, dir_type):
    """Accumulate per-directory file sizes from the global ``dirs`` table.

    dir_type selects self-only totals or totals that include
    subdirectories.  Returns a dict mapping dir_num -> total size
    (previous version returned None and always raised AssertionError).

    NOTE(review): nothing on this code path calls read_diskinfo_file()
    first, and F-line data is never appended to the per-directory file
    lists, so totals are currently always zero -- confirm intent.
    """
    global dirs

    # Accept both the documented mode names and the short names the
    # dispatcher at the bottom of the script actually passes ("self_only"
    # / "with_sub"); the original assert rejected the latter, so every
    # call failed.
    assert dir_type in ("dirs_self_only", "dirs_with_sub",
                        "self_only", "with_sub")
    with_sub = dir_type in ("dirs_with_sub", "with_sub")

    # Each directory first adds up the files rooted directly in it.
    # The original tried `file_data[5] += fsize`, indexing the file *list*
    # at [5]; and the dir tuples are immutable, so accumulate into a
    # separate dict instead of mutating slot [5] in place.
    totals = {}
    for (dir_num, dir_info) in dirs.items():
        total = 0
        for tup in dir_info[2]:   # tup == (file_name, file_size)
            total += tup[1]
        totals[dir_num] = total

    if with_sub:
        # Propagate each directory's own total up through its ancestors so
        # a parent also counts files in all of its subdirectories.  (The
        # original tested the typo "dirs_with_subs", which never matched.)
        own = totals.copy()
        for (dir_num, dir_info) in dirs.items():
            par = dir_info[4]                 # parent dir number; None at root
            while par is not None:
                totals[par] = totals[par] + own[dir_num]
                par = dirs[par][4]

    return totals
        

##
## Generate info about how disk space is being used
## there are two main kinds of info 1. list of the
## largest files and 2. the directories that contain
## the largest files
##

fname = sys.argv[2]
f = file(fname, "r")    # Python 2 builtin `file`; equivalent to open()

##
## dirs hash table
##   key is the directory number (stored as an int by the S/T handlers
##   in read_diskinfo_file)
##   data is a tuple:
##
##     [0] --> name
##     [1] --> list_of_subdir_nums
##     [2] --> list_of_file_data
##     [3] --> num_as_key
##     [4] --> par_dir_num
##     [5] --> size
##
##  file data is for files rooted in this directory
##
##  each file data tuple:
##     
##    [0] --> name
##    [1] --> file size
##
dirs = {}

# Dispatch on the mode keyword; only the "files" path runs under the
# profiler.
try:
    if sys.argv[1] == "files":
        profile.run("read_for_files(f)")
    elif sys.argv[1] == "dirs_self_only":
        # NOTE(review): short mode names are passed here -- confirm
        # read_for_dirs accepts "self_only" / "with_sub".
        read_for_dirs(f, "self_only")
    elif sys.argv[1] == "dirs_with_sub":
        read_for_dirs(f, "with_sub")
    else:
        print "Unknown option:", sys.argv[1]
        sys.exit(0)
except AssertionError:
    # Drop into the post-mortem debugger on any assertion failure.
    exc_typ, exc_val, exc_trace = sys.exc_info()
    pdb.post_mortem(exc_trace)

sys.exit(0)






More information about the Python-list mailing list