looking for speed-up ideas
Ram Bhamidipaty
ramb at sonic.net
Mon Feb 3 19:39:09 EST 2003
I have some python code that processes a large file. I want to see how
much faster this code can get. Mind you, I don't _need_ the code to go
faster - but it sure would be nice if it were faster...
Here is a specification of the input:
1. Lines start with T, S or F
2. The first line of the file starts with T; all other lines start with S or F.
3. F lines look like "F/<number>/string"
4. S lines look like "S/string/<number>/<number>"
Here is a sample:
T /remote 0
S/name/0/1
S/joe/1/2
S/bob/1/3
F/3150900/big_file.tar.gz
S/testing/3/4
F/414/.envrc
F/276/BUILD_FLAGS
F/36505/make.incl
F/3861/build_envrc
In case you are curious, the file is a dump of a file system. F lines
specify a file size and file name. S lines specify a directory. The
numbers on an S line are the parent directory number followed by the
directory's own number. All the F lines under an S line are files in
that directory.
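To make the layouts concrete, here is a minimal sketch of a parser for
the three line types (my own illustration; the actual script below does
the same splits inline):

def parse_line(l):
    if l[0] == "F":                      # F/<size>/<name>
        tag, size, name = l.split("/", 2)
        return ("F", long(size), name.rstrip())
    elif l[0] == "S":                    # S/<name>/<par_num>/<this_dir_num>
        tag, name, par, num = l.split("/")
        return ("S", name, int(par), int(num))
    else:                                # T <full_path_of_top_dir> 0
        tag, path, zero = l.split()
        return ("T", path, int(zero))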
My script reads the file and prints out the 200 largest files.
I am currently using the heapqc module written by John Eikenberry. I
downloaded it from here: http://zhar.net/projects/python/
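The idea is to keep a min-heap of the 200 largest sizes seen so far, so
the smallest survivor is always at heap[0] and most lines cost at most
one comparison. A stripped-down sketch of the pattern (my illustration,
using plain sizes rather than the FileSize objects the script uses):

def top_n(sizes, n=200):
    heap = sizes[:n]
    heapqc.heapify(heap)                 # smallest of the first n ends up at heap[0]
    for s in sizes[n:]:
        if s > heap[0]:                  # beats the current smallest?
            heapqc.heapreplace(heap, s)  # pop the smallest, push s
    return heap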
There is an edited version of the script at the end of this message.
My script processes 300,000 lines in about 18 seconds on a
Sun Ultra 60. To make performance testing easier, the script
currently limits itself to reading the first 300,000 lines.
The wc program can read the same 300k lines in around 0.4 seconds.
The full input file is around 43 Meg with around 2.2 million lines.
The output from the python profiler shows this:
15426 function calls in 18.770 CPU seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.010 0.010 18.750 18.750 <string>:1(?)
0 0.000 0.000 profile:0(profiler)
1 0.020 0.020 18.770 18.770 profile:0(read_for_files(f))
1 0.000 0.000 18.740 18.740 read_diskinfo:163(read_for_files)
1752 0.060 0.000 0.060 0.000 read_diskinfo:50(__init__)
13670 0.560 0.000 0.560 0.000 read_diskinfo:59(__le__)
1 18.120 18.120 18.740 18.740 read_diskinfo:89(read_diskinfo_file)
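For context, the 13670 __le__ calls are the heap comparing FileSize
objects inside heapify/heapreplace; each one is a Python-level method
call. As a hypothetical alternative (a sketch only, not what the script
below does), plain tuples compare element-wise in C:

# sketch: fname stands for the name field split out of the line
entry = (fsize, (this_dir_num, fname))   # size first, so ordering is by size
if entry > big_files[0]:                 # compared in C, no __le__ call
    heapqc.heapreplace(big_files, entry)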
I am willing to write C code, but I would first like to get the python
code faster.
Any ideas appreciated.
-Ram
------------------------------------------------------------------
#!/remote/TMhome/ramb/tools/bin/python
import sys, xreadlines, pdb, os.path, imp
import profile

m = imp.find_module("heapqc", ["/remote/TMhome/ramb/src/Downloaded/py_heap"])
if not m:
    print "Unable to load heap module"
    sys.exit(0)
mod = imp.load_module("heapqc", m[0], m[1], m[2])
if not mod:
    print "module load failed"
import heapqc
if len(sys.argv) != 3:
    print "Usage: read_diskinfo files | dirs_self_only | dirs_with_sub <file>"
    sys.exit(0)
## file format:
##
## T full_path_of_top_dir 0
## F/<size1>/<file1>
## F/<size2>/<file2>
## ...
## S/<subname>/<par_num>/<this_dir_num>
## F/<size1>/<file1>
## F/<size2>/<file2>
## ...
##
class FileSize:
    def __init__(self, name, size):
        self.name = name
        self.size = size
    def __lt__(self, other):
        assert type(self) == type(other)
        return self.size < other.size
    def __le__(self, other):
        return self.size <= other.size
    def __eq__(self, other):
        assert type(self) == type(other)
        return self.size == other.size
    def __ne__(self, other):
        assert type(self) == type(other)
        return self.size != other.size
    def __ge__(self, other):
        assert type(self) == type(other)
        return self.size >= other.size
    def __gt__(self, other):
        assert type(self) == type(other)
        return self.size > other.size
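# (Per the profile above, the heap operations only ever call __le__:
#  13670 times in the 300k-line run. The other rich comparison methods
#  are defined for completeness but are not on the hot path.)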
def read_diskinfo_file(f):
    global dirs
    big_files = []
    print "Starting to read ...",
    lnum = 0
    num_in_list = 0
    sys.stdout.flush()
    for l in xreadlines.xreadlines(f):
        lnum += 1
        #if (lnum & 0xffff) == 0:
        #    print "read", lnum, "lines."
        if lnum > 300000:
            sys.exit(0)
        if l[0] == "F":
            # F/<size>/<name>
            tup = l.split("/")
            try:
                fsize = int(tup[1])
            except ValueError:
                fsize = long(tup[1])
            flist.append((tup[2], fsize))   # record (name, size) in the current dir
            if num_in_list < 200:
                big_files.append(FileSize((this_dir_num, tup[2]), fsize))
                num_in_list += 1
                if num_in_list == 200:
                    heapqc.heapify(big_files)
            else:
                if fsize < big_files[0].size:
                    continue
                heapqc.heapreplace(big_files,
                                   FileSize((this_dir_num, tup[2]), fsize))
        elif l[0] == "S":
            # S/<name>/<par_num>/<this_dir_num>
            tup = l.split("/")
            name = tup[1]
            par_num = int(tup[2])
            this_dir_num = int(tup[3])
            #print "Working on dir_num=", this_dir_num
            flist = []
            dirs[this_dir_num] = [name, [], flist, this_dir_num, par_num, 0L]
        elif l[0] == "T":
            # T full_path 0
            tup = l.split()
            flist = []
            this_dir_num = 0
            dirs[0] = [tup[1], [], flist, 0, None, 0L]
    print "done reading."
    while big_files:
        o = heapqc.heappop(big_files)
        print o.size, o.name
    return
def get_dir_name(num):
    global dirs
    cur_dir_num = num
    cur_name = ""
    while cur_dir_num != 0:
        dir_tup = dirs[cur_dir_num]
        cur_name = os.path.join(dir_tup[0], cur_name)
        cur_dir_num = dir_tup[4]
    return cur_name
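# Worked example with the sample data above: get_dir_name(3) walks the
# par_dir_num links 3 -> 1 -> 0, joining "bob" and then "name", and
# returns "name/bob/". The top directory itself (dirs[0]) is not
# included because the walk stops at directory number 0.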
def read_for_files(f):
    global dirs
    read_diskinfo_file(f)
    all_file_list = []
    for (dir_num, dir_info) in dirs.items():
        file_data = dir_info[2]
        for tup in file_data:
            fname = tup[0]
            fsize = tup[1]
            encoding = fsize
            all_file_list.append((encoding, fname, fsize, dir_num))
    all_file_list.sort()
    all_file_list.reverse()
    #print "Top files sorted by file size"
    num_to_print = 100
    num_printed = 0
    for t in all_file_list:
        print t[2], t[1], get_dir_name(t[3])
        num_printed += 1
        if num_printed == num_to_print:
            break
def read_for_dirs(f, dir_type):
    global dirs
    assert dir_type == "self_only" or dir_type == "with_sub"
    read_diskinfo_file(f)
    # for both cases a directory needs to add up the files rooted there
    for (dir_num, dir_info) in dirs.items():
        for tup in dir_info[2]:
            dir_info[5] += tup[1]
    if dir_type == "with_sub":
        # also charge each directory's own total to all of its ancestors
        own_sizes = {}
        for (dir_num, dir_info) in dirs.items():
            own_sizes[dir_num] = dir_info[5]
        for (dir_num, own) in own_sizes.items():
            par = dirs[dir_num][4]
            while par is not None:
                dirs[par][5] += own
                par = dirs[par][4]
##
## Generate info about how disk space is being used
## there are two main kinds of info 1. list of the
## largest files and 2. the directories that contain
## the largest files
##
fname = sys.argv[2]
f = file(fname, "r")
##
## dirs hash table
## key is the directory number (an int)
## data is a list:
##
## [0] --> name
## [1] --> list_of_subdir_nums
## [2] --> list_of_file_data
## [3] --> num_as_key
## [4] --> par_dir_num
## [5] --> size
##
## file data is for files rooted in this directory
##
## each file data tuple:
##
## [0] --> name
## [1] --> file size
##
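## Example: the sample line "S/joe/1/2" above becomes
##
##   dirs[2] = ["joe", [], [], 2, 1, 0L]
##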
dirs = {}
try:
    if sys.argv[1] == "files":
        profile.run("read_for_files(f)")
    elif sys.argv[1] == "dirs_self_only":
        read_for_dirs(f, "self_only")
    elif sys.argv[1] == "dirs_with_sub":
        read_for_dirs(f, "with_sub")
    else:
        print "Unknown option:", sys.argv[1]
        sys.exit(0)
except AssertionError:
    exc_typ, exc_val, exc_trace = sys.exc_info()
    pdb.post_mortem(exc_trace)
    sys.exit(0)