speed problems
Hans-Peter Jansen
hpj at urpla.net
Thu Jun 10 14:27:22 EDT 2004
Hi Axel & Pythoneers,
I played around with your scripts, and my winner got a bit longer than
usual. I hope the important part hasn't suffered too much, but thanks
to a cheat, this one is faster than your original Perl script, even with
profiling and annotations enabled! Talking 'bout the latter: shamelessly
stolen from a Zope check-in by our master master, because I couldn't get
hotshot to produce useful per-line statistics out of the box.
http://mail.zope.org/pipermail/zope-cvs/2002-May/001035.html
Well, although I have to admit that Perl seems faster on this specific
task (since the grep cheat would work for Perl too), I would never
consider such a move; just try to do this with Perl:
---8<--- [virstat.py] ---8<---
#!/usr/bin/python
import os
import re
# Mail logs to scan, in this order.  dovirstat() skips entries that are not
# regular files and picks the reader from the suffix (.gz, .bz2 or none).
maillogs = [
    "mail",
    "mail-20040600.gz",
    "mail-20040604.gz",
    "mail-20040607.gz",
    "mail-20040610.gz",
]

# Commands that decompress a log to stdout.  The commented pair is the
# equivalent spelling via the compressors' own "-dc" switches.
#gzip = "/usr/bin/gzip -dc"
#bzip2 = "/usr/bin/bzip2 -dc"
gzip = "/usr/bin/zcat"
bzip2 = "/usr/bin/bzcat"

# Results filled in by dovirstat(): hits per virus name and the overall
# number of hits (the denominator for the percentages printed later).
virstat = {}
total = 0

# Non-zero: run dovirstat() under the hotshot profiler and print an annotated
# source listing afterwards.  Zero: just run it.
doprof = 1

# Captures the comma-separated name list inside "INFECTED (...)".
# Raw string: "\(" is not a valid string escape, so a plain literal only works
# by accident and triggers an invalid-escape warning on newer interpreters.
pat = re.compile(r"INFECTED \((.*)\)")
def dovirstat():
    """Count virus names reported in the configured mail logs.

    Walks the module-level ``maillogs`` list, skipping entries that are not
    regular files.  Every line matching ``pat`` contributes its
    ", "-separated virus names: each name bumps its counter in the
    module-level ``virstat`` dict and the module-level ``total``.

    Returns nothing; results are left in ``virstat`` and ``total``.
    """
    global virstat, total
    for logfile in maillogs:
        if not os.path.isfile(logfile):
            # Missing logs are skipped silently.
            continue

        # Pick a line source by suffix.  os.popen() yields only the child's
        # stdout; the former os.popen2() also opened a stdin pipe that was
        # never used nor closed, leaking one descriptor per compressed log.
        if logfile.endswith('.gz'):
            # XXX: cheating -- grep pre-filters the decompressed stream so
            # that only candidate lines ever reach the Python loop.
            lfd = os.popen("%s %s | grep INFECTED" % (gzip, logfile))
        elif logfile.endswith('.bz2'):
            # XXX: cheating, same as above.
            lfd = os.popen("%s %s | grep INFECTED" % (bzip2, logfile))
        else:
            # Uncompressed: read the file directly, no pre-filter.
            lfd = open(logfile, "r")

        try:
            # hot loop
            for line in lfd:
                mo = pat.search(line)
                if mo:
                    for vnam in mo.group(1).split(", "):
                        virstat[vnam] = virstat.get(vnam, 0) + 1
                        total += 1
        finally:
            # Close the file/pipe even if the scan raises.
            lfd.close()
def load_line_info(log):
    """Fold profiler events into per-location totals.

    ``log`` is an iterable of ``(what, place, tdelta)`` triples.  The result
    maps a location to ``(summed tdelta, number of events)``.  A delta is
    booked on the location remembered from an earlier event, so the first
    counted delta lands under the key ``None``.
    """
    totals = {}
    last_place = None
    for _what, place, tdelta in log:
        if not tdelta > 0:
            # Events without a positive delta contribute nothing.
            continue
        spent, hits = totals.get(last_place, (0, 0))
        totals[last_place] = (tdelta + spent, hits + 1)
        # NOTE(review): the remembered location advances only on counted
        # events -- confirm this nesting against the upstream snippet.
        last_place = place
    return totals
def basename(path, cache={}):
    """Return the final component of ``path``, memoised in ``cache``.

    The mutable default is deliberate: it is the memo table shared by all
    calls that do not pass their own ``cache`` mapping.
    """
    if path in cache:
        return cache[path]
    tail = os.path.split(path)[1]
    cache[path] = tail
    return tail
def print_results(results):
    """Print one line per ``(info, place)`` pair in ``results``.

    ``info`` is a pair of integers and ``place`` a
    ``(filename, line, funcname)`` triple.  Pairs with a false ``place`` are
    reported as 'Bad unpack:' lines instead.
    """
    for info, place in results:
        if place:
            filename, line, _funcname = place
            fields = ['%8d %8d' % info, basename(filename), str(line)]
        else:
            fields = ['Bad unpack:', str(info), str(place)]
        # One pre-joined string reproduces the space-separated print output.
        print(' '.join(fields))
def annotate_results(results):
    """Group profile results by file and print an annotated listing of each.

    ``results`` holds ``((time, hits), (file, line, func))`` pairs; pairs with
    a false location are ignored.  Files are handled in sorted name order,
    files that do not exist are skipped, and every remaining file is passed
    to annotate() with its ``(line, hits, time)`` entries sorted.
    """
    per_file = {}
    for stats, place in results:
        if not place:
            continue
        elapsed, hits = stats
        path, lineno, _func = place
        per_file.setdefault(path, []).append((lineno, hits, elapsed))

    paths = list(per_file)
    paths.sort()
    for path in paths:
        if not os.path.exists(path):
            continue
        entries = per_file[path]
        entries.sort()
        annotate(path, entries)
def annotate(file, lines):
    """Print the source file ``file`` with profile figures in the margin.

    file  -- path of the source file to list.
    lines -- sequence of ``(lineno, hits, time)`` tuples in ascending
             ``lineno`` order; it may be empty and is not modified.

    After a three-line header, every source line is printed behind a
    17-column margin holding that line's hits and time, or blanks when the
    line has no entry.  A blank line ends the listing.  Each source line is
    written newline-terminated, also a final line that lacks its own newline.
    """
    rule = "-" * 60
    print(rule)
    print(file)
    print(rule)

    blank_margin = " " * 16
    pending = 0  # index of the next entry of ``lines`` not yet printed
    f = open(file)
    try:
        lineno = 1
        for text in f:
            # print() supplies the line end, so drop the one read from disk.
            text = text.rstrip("\n")
            if pending < len(lines) and lines[pending][0] == lineno:
                margin = "%6d %8d " % lines[pending][1:]
                pending += 1
            else:
                margin = blank_margin
            print(margin + " " + text)
            lineno += 1
    finally:
        # The listing may be long; never leave the source file open.
        f.close()
    print("")
# --- script driver -------------------------------------------------------
# Run the scan, either directly or under the hotshot profiler.
if not doprof:
    dovirstat()
else:
    import hotshot
    # lineevents=1 asks hotshot to log per-line events, which the
    # annotation step below depends on.
    prof = hotshot.Profile("virstat.prof", lineevents=1)
    prof.runcall(dovirstat)
    # Close the profile before it is read back further down.
    prof.close()

# Report every virus name, sorted, with its share of all hits in percent.
vlist = virstat.keys()
vlist.sort()
for vname in vlist:
    # float() forces true division; the loop body never runs when nothing
    # was counted, so total cannot be zero here.
    p = (virstat[vname] / float(total)) * 100
    print "%-30s %5.2f%%" % (vname, p)
print

# With profiling on, read the profile back and print an annotated listing.
if doprof:
    from hotshot.log import LogReader
    log = LogReader("virstat.prof")
    byline = load_line_info(log)
    # Keep only locations whose first element is 'virstat.py'; the stats
    # tuple comes first in each pair, so the sort orders by it.
    results = [(v, k) for k, v in byline.items() if k and k[0] == 'virstat.py' ]
    results.sort()
    # Alternative flat report, one line per location:
    #print_results(results)
    annotate_results(results)
--->8---
Python programming is not only an easy way to get necessary work done;
at its best it combines art and science in an aesthetic manner.
Pete
More information about the Python-list
mailing list