speed problems

Thu Jun 10 14:27:22 EDT 2004

Hi Axel & Pythoneers,

I played around with your scripts, and my winner got a bit longer than
usual. I hope the important part didn't suffer too much, but thanks
to a cheat, this one is faster than your original perl script, even with
profiling and annotations enabled! Talking 'bout the latter: shamelessly
stolen from a Zope check-in by our master master, because I couldn't get
hotshot to produce useful per-line statistics out of the box.
http://mail.zope.org/pipermail/zope-cvs/2002-May/001035.html

Well, although I have to admit that perl seems faster on this specific
task (since the grep cheat would work for perl too), I would never
consider such a move. Just try to do this with perl:

---8<--- [virstat.py] ---8<---
#!/usr/bin/python

import os
import re

# Log files to scan, in processing order. Names are used as given, so they
# are resolved relative to the current working directory.
maillogs = [
            "mail",
            "mail-20040600.gz",
            "mail-20040604.gz",
            "mail-20040607.gz",
            "mail-20040610.gz",
           ]

# Command prefixes that write a decompressed log to stdout; the log file name
# is appended when the command line is built.
#gzip = "/usr/bin/gzip -dc"
#bzip2 = "/usr/bin/bzip2 -dc"
gzip = "/usr/bin/zcat"
bzip2 = "/usr/bin/bzcat"

# Accumulators filled in by dovirstat(): hits per virus name, and all hits.
virstat = {}
total = 0

# Non-zero: run the scan under the hotshot profiler and print annotations.
doprof = 1

# Captures the text between the parentheses that follow "INFECTED".
# Raw string: the backslashes belong to the regex, not to Python's string
# escapes (a plain literal only worked because "\(" is an unknown escape).
pat = re.compile(r"INFECTED \((.*)\)")

def dovirstat():
    """Count virus detections per virus name across the configured mail logs.

    Walks the module-level ``maillogs`` list and skips every entry that is
    not an existing regular file. Logs ending in ``.gz`` / ``.bz2`` are read
    through the external ``gzip`` / ``bzip2`` commands; any other log is
    opened directly. For each line matching ``pat``, the captured group is
    split on ", " and every resulting name adds one to ``virstat[name]`` and
    to the global ``total``.
    """
    global virstat, total
    for logfile in maillogs:
        if not os.path.isfile(logfile):
            # Missing logs are skipped silently.
            continue

        # is it compressed?
        if logfile.endswith('.gz'):
            decompress = gzip
        elif logfile.endswith('.bz2'):
            decompress = bzip2
        else:
            decompress = None

        if decompress is None:
            # uncompressed
            lfd = open(logfile, "r")
        else:
            # XXX: cheating -- grep pre-filters the decompressed stream so
            # only candidate lines reach the Python loop.
            # os.popen yields just the read end of the pipe; os.popen2 also
            # handed back the child's stdin, which was never closed.
            lfd = os.popen("%s %s | grep INFECTED" % (decompress, logfile))

        # try/finally guarantees the file/pipe is released even if a line
        # fails to process; it sits outside the loop, so it costs nothing
        # per line.
        try:
            # hot loop
            for line in lfd:
                mo = pat.search(line)
                if mo:
                    for vnam in mo.group(1).split(", "):
                        virstat[vnam] = virstat.get(vnam, 0) + 1
                        total += 1
        finally:
            lfd.close()

def load_line_info(log):
    """Total up elapsed time and hit counts per location from profiler events.

    *log* is any iterable of ``(what, place, tdelta)`` triples (in this
    script, a hotshot ``LogReader``). Events whose ``tdelta`` is not greater
    than zero are ignored entirely. Every other event is charged to the
    place remembered from the previous counted event -- ``None`` for the
    very first one -- and its own place is then remembered for the next.

    Returns a dict mapping place -> ``(summed tdelta, number of hits)``.
    """
    totals = {}
    charged_to = None
    for _what, place, tdelta in log:
        if not (tdelta > 0):
            continue
        elapsed, hits = totals.get(charged_to, (0, 0))
        totals[charged_to] = (tdelta + elapsed, hits + 1)
        charged_to = place
    return totals

def basename(path, cache={}):
    """Return the last component of *path*, memoised in *cache*.

    The default *cache* dict is created once and shared by every call that
    does not pass its own, so it serves as the memo table: a path seen
    before is answered from the dict, a new one is split with
    ``os.path.split`` and stored before being returned.
    """
    if path in cache:
        return cache[path]
    _directory, fn = os.path.split(path)
    cache[path] = fn
    return fn

def print_results(results):
    for info, place in results:
        if not place:
            print 'Bad unpack:', info, place
            continue
        filename, line, funcname = place
        print '%8d %8d' % info, basename(filename), line

def annotate_results(results):
    """Group profile results by file and print an annotated listing of each.

    *results* holds ``(stats, place)`` pairs where *stats* is
    ``(time, hits)`` and *place* is ``(file, line, func)``; pairs with a
    false *place* are ignored. Files are handled in sorted name order, files
    that do not exist on disk are skipped, and each file's
    ``(line, hits, time)`` entries are sorted before being passed to
    ``annotate``.
    """
    per_file = {}
    for stats, place in results:
        if place:
            elapsed, count = stats
            path, lineno, _func = place
            per_file.setdefault(path, []).append((lineno, count, elapsed))

    paths = per_file.keys()
    paths.sort()
    for path in paths:
        if not os.path.exists(path):
            continue
        entries = per_file[path]
        entries.sort()
        annotate(path, entries)

def annotate(file, lines):
    print "-" * 60
    print file
    print "-" * 60
    f = open(file)
    i = 1
    match = lines[0][0]
    for line in f:
        if match == i:
            print "%6d %8d " % lines[0][1:], line,
            del lines[0]
            if lines:
                match = lines[0][0]
            else:
                match = None
        else:
            print " " * 16, line,
        i += 1
    print

# Run the scan, under the hotshot profiler when doprof is set. Line events
# are enabled and the profile is written to "virstat.prof".
if doprof:
    import hotshot
    prof = hotshot.Profile("virstat.prof", lineevents=1)
    prof.runcall(dovirstat)
    prof.close()
else:
    dovirstat()

vlist = virstat.keys()
vlist.sort()
for vname in vlist:
  p = (virstat[vname] / float(total)) * 100
  print "%-30s  %5.2f%%" % (vname, p)
print

if doprof:
    from hotshot.log import LogReader

    # Read the profile back and total time/hits per source location.
    log = LogReader("virstat.prof")
    byline = load_line_info(log)
    # Keep only locations inside this script. Compare base names: the file
    # name recorded for a location carries whatever path the script was
    # started with (e.g. "./virstat.py"), so an exact match on 'virstat.py'
    # would only succeed when it is launched by bare name.
    results = [(v, k) for k, v in byline.items()
               if k and basename(k[0]) == 'virstat.py']
    results.sort()
    #print_results(results)
    annotate_results(results)

--->8---

Python programming is not only an easy way to get necessary work done;
at its best, it combines art and science in an aesthetic manner.

Pete