[Spambayes-checkins] spambayes/testtools cmp.py,NONE,1.1.2.1 fpfn.py,NONE,1.1.2.1 mboxtest.py,NONE,1.1.2.1 rates.py,NONE,1.1.2.1 simplexloop.py,NONE,1.1.2.1 table.py,NONE,1.1.2.1 timcv.py,NONE,1.1.2.1 timtest.py,NONE,1.1.2.1 weaktest.py,NONE,1.1.2.1

Anthony Baxter anthonybaxter at users.sourceforge.net
Fri Jan 10 02:41:10 EST 2003


Update of /cvsroot/spambayes/spambayes/testtools
In directory sc8-pr-cvs1:/tmp/cvs-serv9389/testtools

Added Files:
      Tag: reorg-branch
	cmp.py fpfn.py mboxtest.py rates.py simplexloop.py table.py 
	timcv.py timtest.py weaktest.py 
Log Message:
Checkpointing before I head home.

Still to do: 
 - distutils magic to make sure that the 22compat modules are 
   installed when needed.
 - Walking through testtools and utilities and fixing imports.
 - Documentation.

hammie works, everything else that people use in day-to-day operation
should work - please give it a go.



--- NEW FILE: cmp.py ---
#!/usr/bin/env python

"""
cmp.py sbase1 sbase2

Combines output from sbase1.txt and sbase2.txt, which are created by
rates.py from timtest.py output, and displays comparison statistics to
stdout.
"""

import sys
# The two summary-file names to compare, taken from the first two
# command-line arguments.  Fewer than two arguments fails right here.
f1n, f2n = sys.argv[1:3]

def suck(f):
    """Parse one rates.py summary file and return its statistics.

    f is an open file-like object; only its readline() method is used.
    Every '-> <stat> tested' line met along the way is echoed to stdout.

    Returns the 10-tuple
      (list of all f-p rates,
       list of all f-n rates,
       total f-p,
       total f-n,
       average f-p rate,
       average f-n rate,
       list of all ham score deviations,
       list of all spam score deviations,
       ham score deviation for all runs,
       spam score deviations for all runs,
      )
    where each "deviation" is a (mean, sdev) pair and the two "for all
    runs" pairs stay (0.0, 0.0) when the file has no such line.

    Raises ValueError when the file ends before the 'total ...' lines
    (e.g. a truncated summary) or when a data line does not consist of
    exactly two numbers.
    """
    fns = []
    fps = []
    hamdev = []
    spamdev = []
    hamdevall = spamdevall = (0.0, 0.0)

    get = f.readline
    while 1:
        line = get()
        if not line:
            # readline() returns '' only at end of file.  Report that
            # plainly instead of failing on the unpacking further down.
            raise ValueError("summary file ended before the 'total' lines")
        if not line.strip():
            # A stray blank line carries no data; skip it.
            continue
        if line.startswith('-> <stat> tested'):
            # The line still has its newline, so write it out verbatim.
            sys.stdout.write(line)
        if line.find(' items; mean ') != -1:
            # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68
            # and later "sample " went away
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            # Third word of the first field: 'Ham', anything else counts
            # as spam.
            typ = vals[0].split()[2]
            if line.find('for all runs') != -1:
                if typ == 'Ham':
                    hamdevall = val
                else:
                    spamdevall = val
            elif line.find('all in this') != -1:
                if typ == 'Ham':
                    hamdev.append(val)
                else:
                    spamdev.append(val)
            continue
        if line.startswith('-> '):
            continue
        if line.startswith('total'):
            break
        # A line with an f-p rate and an f-n rate.
        p, n = map(float, line.split())
        fps.append(p)
        fns.append(n)

    # The loop stopped on the first of these four closing lines:
    # "total unique false pos 0"
    # "total unique false neg 0"
    # "average fp % 0.0"
    # "average fn % 0.0"
    fptot = int(line.split()[-1])
    fntot = int(get().split()[-1])
    fpmean = float(get().split()[-1])
    fnmean = float(get().split()[-1])
    return (fps, fns, fptot, fntot, fpmean, fnmean,
            hamdev, spamdev, hamdevall, spamdevall)

def tag(p1, p2):
    """Describe how a rate changed from p1 (before) to p2 (after).

    Returns "tied" (padded) when the values are equal; otherwise "lost"
    when the value went up or "won" when it went down, followed by the
    signed percentage change relative to p1, or by "+(was 0)" when p1 is
    zero and no percentage can be computed.
    """
    if p1 == p2:
        return "tied          "

    if p1 < p2:
        verdict = "lost "
    else:
        verdict = "won  "

    if not p1:
        # No meaningful percentage relative to zero.
        return verdict + " +(was 0)"

    change = (p2 - p1) * 100.0 / p1
    return verdict + " %+7.2f%%" % change

def mtag(m1, m2):
    """Format a before/after comparison of two (mean, sdev) pairs.

    The result shows both means and their percentage change, then both
    sdevs and their percentage change.  A percentage is replaced by
    "+(was 0)" when the "before" value is zero.
    """
    def change(before, after):
        # Signed percentage change, or a marker when there is no base.
        if before:
            return "%+7.2f%%" % ((after - before) * 100.0 / before)
        return "+(was 0)"

    mean1, dev1 = m1
    mean2, dev2 = m2
    pieces = [
        "%7.2f %7.2f " % (mean1, mean2),
        change(mean1, mean2),
        "     %7.2f %7.2f " % (dev1, dev2),
        change(dev1, dev2),
    ]
    return "".join(pieces)

def dump(p1s, p2s):
    """Print a before/after table for two parallel lists of rates.

    One row is printed per pair (extra items in the longer list are
    ignored by zip), followed by a count of how many rows won, tied and
    lost according to tag().
    """
    verdicts = []
    for before, after in zip(p1s, p2s):
        verdict = tag(before, after)
        verdicts.append(verdict)
        print("    %5.3f  %5.3f  %s" % (before, after, verdict))
    print("")

    # Count by substring over all the verdict texts strung together.
    combined = " ".join(verdicts)
    for outcome in ("won", "tied", "lost"):
        print("%-4s %2d times" % (outcome, combined.count(outcome)))
    print("")

def dumpdev(meandev1, meandev2):
    """Print one mtag() comparison line per pair of (mean, sdev) entries."""
    pairs = zip(meandev1, meandev2)
    for before, after in pairs:
        row = mtag(before, after)
        print(row)

def windowsfy(fn):
    """Return fn + '.txt' if a file of that name exists, else fn itself."""
    import os
    candidate = fn + '.txt'
    if not os.path.exists(candidate):
        return fn
    return candidate

# Announce the comparison using the names exactly as they were typed.
print("%s -> %s" % (f1n, f2n))


# Switch to "<name>.txt" for whichever argument has such a file.
f1n = windowsfy(f1n)
f2n = windowsfy(f2n)

# Statistics of the "before" summary ...
(fp1, fn1, fptot1, fntot1, fpmean1, fnmean1,
 hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(file(f1n))

# ... and of the "after" summary.
(fp2, fn2, fptot2, fntot2, fpmean2, fnmean2,
 hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(file(f2n))

print("")
print("false positive percentages")
dump(fp1, fp2)
print("total unique fp went from %s to %s %s"
      % (fptot1, fptot2, tag(fptot1, fptot2)))
print("mean fp %% went from %s to %s %s"
      % (fpmean1, fpmean2, tag(fpmean1, fpmean2)))

print("")
print("false negative percentages")
dump(fn1, fn2)
print("total unique fn went from %s to %s %s"
      % (fntot1, fntot2, tag(fntot1, fntot2)))
print("mean fn %% went from %s to %s %s"
      % (fnmean1, fnmean2, tag(fnmean1, fnmean2)))

print("")
# The mean/sdev tables are only shown when both files recorded the same
# number of per-run ham entries and the same number of spam entries.
comparable = (len(hamdev1) == len(hamdev2) and
              len(spamdev1) == len(spamdev2))
if not comparable:
    print("[info about ham & spam means & sdevs not available in both files]")
else:
    print("ham mean                     ham sdev")
    dumpdev(hamdev1, hamdev2)
    print("")
    print("ham mean and sdev for all runs")
    dumpdev([hamdevall1], [hamdevall2])


    print("")
    print("spam mean                    spam sdev")
    dumpdev(spamdev1, spamdev2)
    print("")
    print("spam mean and sdev for all runs")
    dumpdev([spamdevall1], [spamdevall2])

    print("")
    # Distance between the overall spam mean and the overall ham mean.
    diff1 = spamdevall1[0] - hamdevall1[0]
    diff2 = spamdevall2[0] - hamdevall2[0]
    print("ham/spam mean difference: %2.2f %2.2f %+2.2f"
          % (diff1, diff2, diff2 - diff1))

--- NEW FILE: fpfn.py ---
#! /usr/bin/env python
"""Extract false positive and false negative filenames from timcv.py output."""

import sys
import re

def cmpf(a, b):
    # Sort function that sorts by numerical value
    ma = re.search(r'(\d+)/(\d+)$', a)
    mb = re.search(r'(\d+)/(\d+)$', b)
    if ma and mb:
        xa, ya = map(int, ma.groups())
        xb, yb = map(int, mb.groups())
        return cmp((xa, ya), (xb, yb))
    else:
        return cmp(a, b)

def main():
    """List the false positives and false negatives found in each file.

    Every command-line argument names an output file; NAME + ".txt" is
    tried first and NAME itself second (an IOError from the second open
    propagates).  Lines starting with '    new fp: ' or '    new fn: '
    are collected, sorted with cmpf and printed, one item per line,
    under a heading per file.
    """
    for name in sys.argv[1:]:
        try:
            f = open(name + ".txt")
        except IOError:
            f = open(name)
        print("=== %s ===" % name)
        fp = []
        fn = []
        try:
            for line in f:
                # SECURITY NOTE: eval() executes whatever follows the
                # prefix on these lines.  That is acceptable only for
                # output files you produced yourself; never run this on
                # files from an untrusted source.
                if line.startswith('    new fp: '):
                    fp.extend(eval(line[12:]))
                elif line.startswith('    new fn: '):
                    fn.extend(eval(line[12:]))
        finally:
            # Release the file even if a line fails to evaluate.
            f.close()
        fp.sort(cmpf)
        fn.sort(cmpf)
        print("--- fp ---")
        for x in fp:
            print(x)
        print("--- fn ---")
        for x in fn:
            print(x)

if __name__ == '__main__':
    main()

--- NEW FILE: mboxtest.py ---
#! /usr/bin/env python
"""mboxtest.py: A test driver for classifier.

Usage: mboxtest.py [options] <ham> <spam>

Options:
    -f FMT
        One of unix, mmdf, mh, or qmail.  Specifies mailbox format for
        ham and spam files.  Default is unix.

    -n NSETS
        Number of test sets to create for a single mailbox.  Default is 10.

    -s SEED
        Seed for random number generator.  Default is 101.

    -m MSGS
        Read no more than MSGS messages from mailbox.
"""

from __future__ import generators

import getopt
import mailbox
import random
import re
from sets import Set
import sys

from spambayes.tokenizer import tokenize
from spambayes.TestDriver import Driver
from spambayes.msgs import Msg
from spambayes.Options import options

# Probe for the True/False names; define them when they are missing.
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0


# Maps a mailbox format name (the value given to the -f option) to the
# mailbox class used to read that format.
mbox_fmts = {"unix": mailbox.PortableUnixMailbox,
             "mmdf": mailbox.MmdfMailbox,
             "mh": mailbox.MHMailbox,
             "qmail": mailbox.Maildir,
             }

class MboxMsg(Msg):
    """A single mailbox message kept as raw text plus an identifying tag."""

    def __init__(self, fp, path, index):
        # Slurp the whole message; the tag combines the mailbox path, the
        # message index and the message's subject line.
        self.guts = fp.read()
        self.tag = "%s:%s %s" % (path, index, subject(self.guts))

    def __str__(self):
        """Return an abbreviated copy of the message for display.

        Lines starting with 'X-', 'Received:' or a tab are left out, and
        at most 100 of the remaining lines are kept, followed by a
        "... truncated" marker when more were available.
        """
        kept = []
        for line in self.guts.split("\n"):
            if (line.startswith('X-') or
                line.startswith('Received:') or
                line.startswith('\t')):
                continue
            if len(kept) >= 100:
                kept.append("... truncated")
                break
            kept.append(line)
        return "\n".join(kept)

    def __iter__(self):
        # Iterating a message yields the tokenizer's output for its text.
        return tokenize(self.guts)

class mbox(object):
    """An iterable over the messages of a mailbox file.

    path is the mailbox file name.  indices, when given, is an indexable
    collection of zero-based message positions; only those messages are
    yielded.  With indices omitted (None) every message is yielded; with
    an empty collection none are.

    Iteration relies on the module-level FMT and mbox_fmts to pick the
    mailbox class.  The file opened for iteration is not closed
    explicitly.
    """

    def __init__(self, path, indices=None):
        self.path = path
        self.indices = {}
        self.key = ''
        # Remember whether a selection was supplied at all: an empty
        # selection has to mean "no messages", which an empty dict alone
        # cannot tell apart from "no selection".
        self.selected = indices is not None
        if indices:
            # The first index doubles as a label in repr().
            self.key = " %s" % indices[0]
            for i in indices:
                self.indices[i] = 1

    def __repr__(self):
        return "<mbox: %s%s>" % (self.path, self.key)

    def __iter__(self):
        # The factory wraps each message's file object in an MboxMsg.  It
        # looks `i` up when it is called, so the message receives the
        # zero-based position it has in the mailbox.
        reader = mbox_fmts[FMT](open(self.path, "rb"),
                                lambda f: MboxMsg(f, self.path, i))

        i = 0
        while 1:
            msg = reader.next()
            if msg is None:
                return
            i += 1
            if not self.selected or (i - 1) in self.indices:
                yield msg

def subject(buf):
    """Return the lower-cased 'subject:' line of message text buf.

    The first occurrence of 'subject:' anywhere in the lower-cased text
    is used (it is not required to start a line), up to but excluding
    the next newline.  Returns '' when there is no such occurrence.
    """
    buf = buf.lower()
    i = buf.find('subject:')
    if i < 0:
        return ''
    j = buf.find("\n", i)
    if j < 0:
        # The subject is the last line and has no trailing newline: keep
        # all of it instead of slicing off its final character.
        j = len(buf)
    return buf[i:j]

def randindices(nelts, nresults):
    L = range(nelts)
    random.shuffle(L)
    chunk = nelts / nresults
    for i in range(nresults):
        yield Set(L[:chunk])
        del L[:chunk]

def sort(seq):
    """Return a new sorted list of the items of seq; seq is left alone."""
    ordered = [item for item in seq]
    ordered.sort()
    return ordered

def main(args):
    """Run the mailbox test driver.

    args is the command line without the program name: the options
    -f FMT, -n NSETS, -s SEED, -m MSGS followed by the ham and the spam
    mailbox paths.  Sets the module-level FMT, which mbox iteration
    reads.  Returns None.
    """
    global FMT

    print(options.display())

    # Defaults, overridden by the options below.
    FMT = "unix"
    NSETS = 10
    SEED = 101
    MAXMSGS = None
    opts, args = getopt.getopt(args, "f:n:s:m:")
    for flag, value in opts:
        if flag == '-f':
            FMT = value
        elif flag == '-n':
            NSETS = int(value)
        elif flag == '-s':
            SEED = int(value)
        elif flag == '-m':
            MAXMSGS = int(value)

    ham, spam = args

    random.seed(SEED)

    # Count the messages by reading each mailbox once in full.
    nham = len(list(mbox(ham)))
    nspam = len(list(mbox(spam)))

    if MAXMSGS:
        nham = min(nham, MAXMSGS)
        nspam = min(nspam, MAXMSGS)

    print("ham %s %s" % (ham, nham))
    print("spam %s %s" % (spam, nspam))

    # NSETS random, disjoint index tuples per mailbox.
    ihams = [tuple(chosen) for chosen in randindices(nham, NSETS)]
    ispams = [tuple(chosen) for chosen in randindices(nspam, NSETS)]

    driver = Driver()

    # Start out trained on every set except the first one.
    for k in range(1, NSETS):
        driver.train(mbox(ham, ihams[k]), mbox(spam, ispams[k]))

    # Test each set against a classifier trained on all the others:
    # forget the set, test it, then learn it again for the next round.
    for i in range(NSETS):
        hams = mbox(ham, ihams[i])
        spams = mbox(spam, ispams[i])

        if i > 0:
            driver.untrain(hams, spams)

        driver.test(hams, spams)
        driver.finishtest()

        if i < NSETS - 1:
            driver.train(hams, spams)

    driver.alldone()

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))

--- NEW FILE: rates.py ---
#!/usr/bin/env python

"""
rates.py basename ...

Assuming that file

    basename + '.txt'
or
    basename

contains output from one of the test drivers (timcv, mboxtest, timtest),
scans that file for summary statistics, displays them to stdout, and also
writes them to file

    basename + 's.txt'

(where the 's' means 'summary').  This doesn't need a full output file
from a test run, and will display stuff for as far as the output file
has gotten so far.

Two of these summary files can later be fed to cmp.py.
"""

import sys

"""
-> Training on Data/Ham/Set2-3 & Data/Spam/Set2-3 ... 8000 hams & 5500 spams
-> Predicting Data/Ham/Set1 & Data/Spam/Set1 ...
-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams
-> <stat> false positive %: 0.025
-> <stat> false negative %: 0.327272727273
-> <stat> 1 new false positives
"""

def doit(basename):
    """Summarize one test-driver output file.

    Reads basename + '.txt' (or basename itself when that cannot be
    opened; a trailing '.txt' in basename is dropped first), echoes every
    line starting with '-> ' and writes the same text, plus one
    "fp-rate fn-rate" line per test and four closing total/average lines,
    both to stdout and to basename + 's.txt'.

    When the input holds no '-> <stat> tested' line yet, the two
    averages are reported as 0.0 rather than dividing by zero.
    """
    if basename.endswith('.txt'):
        basename = basename[:-4]
    try:
        ifile = open(basename + '.txt')
    except IOError:
        ifile = open(basename)
    try:
        interesting = [line for line in ifile if line.startswith('-> ')]
    finally:
        ifile.close()

    oname = basename + 's.txt'
    ofile = open(oname, 'w')
    print("%s -> %s" % (basename, oname))

    def dump(*stuff):
        # Send one space-separated line to stdout and the summary file.
        msg = ' '.join(map(str, stuff))
        print(msg)
        ofile.write(msg + '\n')

    ntests = nfn = nfp = 0
    sumfnrate = sumfprate = 0.0

    try:
        for line in interesting:
            # Echo the line without its newline; a final line that lacks
            # one is echoed whole.
            if line.endswith('\n'):
                dump(line[:-1])
            else:
                dump(line)
            fields = line.split()

            # 0      1      2    3    4 5    6                 -5  -4 -3   -2    -1
            #-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams
            if line.startswith('-> <stat> tested '):
                ntests += 1
                continue

            #  0      1     2        3
            # -> <stat> false positive %: 0.025
            # -> <stat> false negative %: 0.327272727273
            if line.startswith('-> <stat> false '):
                kind = fields[3]
                percent = float(fields[-1])
                if kind == 'positive':
                    sumfprate += percent
                    # Held until the matching false negative line shows
                    # up, so both rates can go out on one line.
                    lastval = percent
                else:
                    sumfnrate += percent
                    dump('    %7.3f %7.3f' % (lastval, percent))
                continue

            #  0      1 2   3     4         5
            # -> <stat> 1 new false positives
            if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false':
                kind = fields[-1]
                count = int(fields[2])
                if kind == 'positives':
                    nfp += count
                else:
                    nfn += count

        if ntests:
            fpavg = sumfprate / ntests
            fnavg = sumfnrate / ntests
        else:
            # Nothing has been tested yet; there is nothing to average.
            fpavg = fnavg = 0.0

        dump('total unique false pos', nfp)
        dump('total unique false neg', nfn)
        dump('average fp %', fpavg)
        dump('average fn %', fnavg)
    finally:
        ofile.close()

# Summarize every file named on the command line.  There is no __main__
# guard, so this also runs when the module is imported.
for name in sys.argv[1:]:
    doit(name)

--- NEW FILE: simplexloop.py ---
#
# Optimize parameters
#
"""Usage: %(program)s  [options] -c command

Where:
    -h
        Show usage and exit.
    -c command
        The command to be run, with all its options. 
        The last line of output from this program should
        match 'YYYYYYY cost: $xxxx.xx'
        (i.e. the third word of the last line should be the value to be
         minimized, preceded by a dollar sign)
        I have used
         "python2.3 timcv.py -n 10 --spam-keep=600 --ham-keep=600 -s 12345"

This program will overwrite bayescustomize.ini!
"""

import sys

def usage(code, msg=''):
    """Print usage message and sys.exit(code).

    msg, when non-empty, is written first and followed by a blank line.
    Everything goes to stderr.
    """
    err = sys.stderr
    if msg:
        err.write(str(msg) + "\n")
        err.write("\n")
    # The module docstring is the usage text; %(program)s and friends
    # are filled in from the module globals.
    err.write(__doc__ % globals() + "\n")
    sys.exit(code)

# Script name; the usage text refers to it as %(program)s.
program = sys.argv[0]

from spambayes import Options

# Starting point handed to SimplexMaximize: the current values of the
# three classifier options that mkini() writes out.
start = (Options.options.unknown_word_prob,
         Options.options.minimum_prob_strength,
         Options.options.unknown_word_strength)
# Second argument to SimplexMaximize, one entry per element of start.
# NOTE(review): presumably a per-parameter step or tolerance -- confirm
# against spambayes.optimize.
err = (0.01, 0.01, 0.01)

def mkini(vars):
    """Overwrite bayescustomize.ini with the three values in vars.

    vars supplies, in order, unknown_word_prob, minimum_prob_strength
    and unknown_word_strength; each is written with six decimals under a
    [Classifier] section.
    """
    f = open('bayescustomize.ini', 'w')
    template = ("\n"
                "[Classifier]\n"
                "unknown_word_prob = %.6f\n"
                "minimum_prob_strength = %.6f\n"
                "unknown_word_strength = %.6f\n")
    f.write(template % tuple(vars))
    f.close()

def score(vars):
    """Run the configured command with parameters vars; return -cost.

    Writes vars to bayescustomize.ini, runs the module-level `command`
    with stdout redirected to loop.out, and takes the cost from the
    third word of the last output line (minus its leading character,
    the dollar sign).  The output is then kept as loop.out.old, its
    last 20 lines are printed, and the parameters and cost are reported.

    The cost is returned negated, so that maximizing the return value
    minimizes the cost.  Exits the program with the command's status if
    that status is non-zero.
    """
    import os
    mkini(vars)
    status = os.system('%s > loop.out'%command)
    if status != 0:
        sys.stderr.write("Error status from subcommand\n")
        sys.exit(status)
    f = open('loop.out', 'r')
    try:
        txt = f.readlines()
    finally:
        # Closed before parsing, so a malformed last line cannot leave
        # the handle open.
        f.close()
    # Extract the flex cost field.
    cost = float(txt[-1].split()[2][1:])
    # os.rename() will not replace an existing file on Windows, so clear
    # out the copy left by the previous run first.
    if os.path.exists('loop.out.old'):
        os.remove('loop.out.old')
    os.rename('loop.out','loop.out.old')
    print(''.join(txt[-20:])[:-1])
    print("x=%.4f p=%.4f s=%.4f %.2f"%(tuple(vars)+(cost,)))
    sys.stdout.flush()
    return -cost

def main():
    """Search for the best parameters and leave them in bayescustomize.ini."""
    from spambayes.optimize import SimplexMaximize
    best = SimplexMaximize(start, err, score)
    # score() rewrote the ini file on every trial; write the winner last.
    mkini(best)
    print("Best result left in bayescustomize.ini")

if __name__ == "__main__":
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hc:')
    except getopt.error, msg:
        usage(1, msg)

    # `command` is deliberately a module-level name: score() reads it.
    command = None
    for opt, arg in opts:
        if opt == '-c':
            command = arg
        elif opt == '-h':
            usage(0)

    # Reject stray arguments first, then a missing command.
    if args:
        usage(1, "Positional arguments not supported")
    if command is None:
        usage(1, "-c is required")

    main()

--- NEW FILE: table.py ---
#!/usr/bin/env python

"""
table.py [-m] base1 base2 ... baseN

Combines output from base1.txt, base2.txt, etc., which are created by
the TestDriver (such as timcv.py) output, and displays tabulated
comparison statistics to stdout.  Each input file is represented by
one column in the table.

Optional argument -m shows a final column with the mean value of each
statistic.
"""

def suck(f):
    """Extract the "all runs" statistics from a test-driver output file.

    f is an open file-like object; only its readline() method is used.
    Every '-> <stat> tested' line met along the way is echoed to stdout.

    Returns the 12-tuple
      (
       ham tested,
       spam tested,
       total f-p,
       total f-n,
       total unsure,
       average f-p rate,
       average f-n rate,
       average unsure rate,
       real cost,
       best cost,
       ham score deviation for all runs,
       spam score deviations for all runs,
      )
    where the two deviations are (mean, sdev) pairs.  Reading stops at
    the '-> <stat> all runs cost: ' line or at end of file, whichever
    comes first; any statistic that was not seen keeps its zero default.
    """
    import sys

    hamdevall = spamdevall = (0.0, 0.0)
    cost = 0.0
    bestcost = 0.0
    fp = 0
    fn = 0
    un = 0
    fpp = 0.0
    fnp = 0.0
    unp = 0.0
    htest = 0
    stest = 0

    get = f.readline
    while 1:
        line = get()
        if not line:
            # End of file without a closing cost line (e.g. output of a
            # run that is still going).  readline() would return ''
            # forever, so stop here with what has been collected.
            break

        if line.startswith('-> <stat> tested'):
            # <stat> tested 1910 hams & 948 spams against 2741 hams & 948 spams
            #      1      2    3    4 5   6
            sys.stdout.write(line)

        elif line.find(' items; mean ') > 0 and line.find('for all runs') > 0:
            # <stat> Ham scores for all runs: 2741 items; mean 0.86; sdev 6.28
            #                                          0          1          2
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            ntested = int(vals[0].split()[-2])
            # Third word: 'Ham', anything else counts as spam.
            typ = vals[0].split()[2]
            if typ == 'Ham':
                hamdevall = val
                htest = ntested
            else:
                spamdevall = val
                stest = ntested

        elif line.startswith('-> best cost for all runs: $'):
            # -> best cost for all runs: $28.20
            bestcost = float(line.split('$')[-1])

        elif line.startswith('-> <stat> all runs false positives: '):
            fp = int(line.split()[-1])

        elif line.startswith('-> <stat> all runs false negatives: '):
            fn = int(line.split()[-1])

        elif line.startswith('-> <stat> all runs unsure: '):
            un = int(line.split()[-1])

        elif line.startswith('-> <stat> all runs false positive %: '):
            fpp = float(line.split()[-1])

        elif line.startswith('-> <stat> all runs false negative %: '):
            fnp = float(line.split()[-1])

        elif line.startswith('-> <stat> all runs unsure %: '):
            unp = float(line.split()[-1])

        elif line.startswith('-> <stat> all runs cost: '):
            # The last statistic of interest; nothing after it is read.
            cost = float(line.split('$')[-1])
            break

    return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
            hamdevall, spamdevall)

def windowsfy(fn):
    """Map fn to fn + '.txt' when that file exists; otherwise return fn."""
    import os
    for candidate in (fn + '.txt',):
        if os.path.exists(candidate):
            return candidate
    return fn

def table():
    """Print a table comparing the results in several output files.

    The command line holds an optional -m followed by file names (each
    mapped through windowsfy).  Every file contributes one 8-character
    column to each row of statistics; with -m a final 12-character
    column holds the mean of each row over all files.

    k is (spam mean - ham mean) / (spam sdev + ham sdev); it is shown as
    0.00 for a file whose two sdevs add up to zero, where the ratio is
    undefined.
    """
    import getopt, sys

    showMean = 0

    # Row labels, all ten characters wide.  File names and ham:spam
    # ratios each get two rows so that long entries can be staggered.
    fname = "filename: "
    fnam2 = " " * 10
    ratio = "ham:spam: "
    rat2  = " " * 10
    fptot = "fp total: "
    fpper = "fp %:     "
    fntot = "fn total: "
    fnper = "fn %:     "
    untot = "unsure t: "
    unper = "unsure %: "
    rcost = "real cost:"
    bcost = "best cost:"

    hmean = "h mean:   "
    hsdev = "h sdev:   "
    smean = "s mean:   "
    ssdev = "s sdev:   "
    meand = "mean diff:"
    kval  = "k:        "

    # Running totals for the optional mean column.
    tfptot = tfpper = tfntot = tfnper = tuntot = tunper = trcost = tbcost = \
    thmean = thsdev = tsmean = tssdev = tmeand = tkval =  0

    pad = " " * 8

    args, fileargs = getopt.getopt(sys.argv[1:], 'm')
    for arg, val in args:
        if arg == "-m":
            showMean = 1

    for filename in fileargs:
        filename = windowsfy(filename)
        infile = open(filename)
        try:
            (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
             hamdevall, spamdevall) = suck(infile)
        finally:
            infile.close()

        # Show the bare name: no '.txt' suffix and no directory part,
        # whichever path separator was used.
        if filename.endswith('.txt'):
            filename = filename[:-4]
        filename = filename[filename.rfind('/')+1:]
        filename = filename[filename.rfind("\\")+1:]

        # The name goes on the first row unless that row has grown longer
        # than the second one (an earlier name overflowed its column);
        # the row not used is padded or cut to the next column boundary.
        if len(fname) > len(fnam2):
            fname += pad
            fname = fname[0:(len(fnam2) + 8)]
            fnam2 += " %7s" % filename
        else:
            fnam2 += pad
            fnam2 = fnam2[0:(len(fname) + 8)]
            fname += " %7s" % filename

        # Same staggering for the ham:spam counts.
        if len(ratio) > len(rat2):
            ratio += pad
            ratio = ratio[0:(len(rat2) + 8)]
            rat2  += " %7s" % ("%d:%d" % (htest, stest))
        else:
            rat2  += pad
            rat2  = rat2[0:(len(ratio) + 8)]
            ratio += " %7s" % ("%d:%d" % (htest, stest))

        fptot += "%8d"   % fp
        tfptot += fp
        fpper += "%8.2f" % fpp
        tfpper += fpp
        fntot += "%8d"   % fn
        tfntot += fn
        fnper += "%8.2f" % fnp
        tfnper += fnp
        untot += "%8d"   % un
        tuntot += un
        unper += "%8.2f" % unp
        tunper += unp
        rcost += "%8s"   % ("$%.2f" % cost)
        trcost += cost
        bcost += "%8s"   % ("$%.2f" % bestcost)
        tbcost += bestcost
        hmean += "%8.2f" % hamdevall[0]
        thmean += hamdevall[0]
        hsdev += "%8.2f" % hamdevall[1]
        thsdev += hamdevall[1]
        smean += "%8.2f" % spamdevall[0]
        tsmean += spamdevall[0]
        ssdev += "%8.2f" % spamdevall[1]
        tssdev += spamdevall[1]
        meand += "%8.2f" % (spamdevall[0] - hamdevall[0])
        tmeand += (spamdevall[0] - hamdevall[0])

        spread = spamdevall[1] + hamdevall[1]
        if spread:
            k = (spamdevall[0] - hamdevall[0]) / spread
        else:
            # Both sdevs are zero, e.g. the file had no score lines and
            # suck() returned its defaults.  Avoid dividing by zero.
            k = 0.0
        kval  += "%8.2f" % k
        tkval  += k

    nfiles = len(fileargs)
    if nfiles and showMean:
        fptot += "%12d"   % (tfptot/nfiles)
        fpper += "%12.2f" % (tfpper/nfiles)
        fntot += "%12d"   % (tfntot/nfiles)
        fnper += "%12.2f" % (tfnper/nfiles)
        untot += "%12d"   % (tuntot/nfiles)
        unper += "%12.2f" % (tunper/nfiles)
        rcost += "%12s"   % ("$%.2f" % (trcost/nfiles))
        bcost += "%12s"   % ("$%.2f" % (tbcost/nfiles))
        hmean += "%12.2f" % (thmean/nfiles)
        hsdev += "%12.2f" % (thsdev/nfiles)
        smean += "%12.2f" % (tsmean/nfiles)
        ssdev += "%12.2f" % (tssdev/nfiles)
        meand += "%12.2f" % (tmeand/nfiles)
        kval  += "%12.2f" % (tkval/nfiles)

    print(fname)
    # The second name/ratio rows are shown only when something landed
    # on them.
    if len(fnam2.strip()) > 0:
        print(fnam2)
    print(ratio)
    if len(rat2.strip()) > 0:
        print(rat2)
    print(fptot)
    print(fpper)
    print(fntot)
    print(fnper)
    print(untot)
    print(unper)
    print(rcost)
    print(bcost)
    print(hmean)
    print(hsdev)
    print(smean)
    print(ssdev)
    print(meand)
    print(kval)

if __name__ == "__main__":
    table()

--- NEW FILE: timcv.py ---
#! /usr/bin/env python

# A driver for N-fold cross validation.

"""Usage: %(program)s [options] -n nsets

Where:
    -h
        Show usage and exit.
    -n int
        Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
        This is required.

If you only want to use some of the messages in each set,

    --HamTrain int
        The maximum number of msgs to use from each Ham set for training.
        The msgs are chosen randomly.  See also the -s option.

    --SpamTrain int
        The maximum number of msgs to use from each Spam set for training.
        The msgs are chosen randomly.  See also the -s option.

    --HamTest int
        The maximum number of msgs to use from each Ham set for testing.
        The msgs are chosen randomly.  See also the -s option.

    --SpamTest int
        The maximum number of msgs to use from each Spam set for testing.
        The msgs are chosen randomly.  See also the -s option.

    --ham-keep int
        The maximum number of msgs to use from each Ham set for testing
        and training. The msgs are chosen randomly.  See also the -s option.

    --spam-keep int
        The maximum number of msgs to use from each Spam set for testing
        and training. The msgs are chosen randomly.  See also the -s option.

    -s int
        A seed for the random number generator.  Has no effect unless
        at least one of {--ham-keep, --spam-keep} is specified.  If -s
        isn't specified, the seed is taken from current time.

In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""

from __future__ import generators

import sys

from spambayes.Options import options
from spambayes import TestDriver
from spambayes import msgs

program = sys.argv[0]

def usage(code, msg=''):
    """Print usage message and sys.exit(code).

    A non-empty msg is written before the usage text, separated from it
    by a blank line.  All output goes to stderr.
    """
    write = sys.stderr.write
    if msg:
        write(str(msg) + "\n")
        write("\n")
    # The usage text is the module docstring with %(program)s filled in
    # from the module globals.
    write(__doc__ % globals() + "\n")
    sys.exit(code)

def drive(nsets):
    """Run cross validation over set directories 1 .. nsets.

    Each set in turn is predicted against a classifier trained on all
    the other sets.  Depending on
    options.build_each_classifier_from_scratch, that classifier is
    either rebuilt from the other sets every time, or adjusted by
    untraining the set before the test and training it again afterwards.
    """
    print(options.display())

    hamdirs = []
    spamdirs = []
    for number in range(1, nsets+1):
        hamdirs.append(options.ham_directories % number)
        spamdirs.append(options.spam_directories % number)

    d = TestDriver.Driver()
    # Train it on all sets except the first.
    rest_ham = msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
                              hamdirs[1:], train=1)
    rest_spam = msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
                                spamdirs[1:], train=1)
    d.train(rest_ham, rest_spam)

    # Now run nsets times, predicting pair i against all except pair i.
    for i in range(nsets):
        h = hamdirs[i]
        s = spamdirs[i]
        hamstream = msgs.HamStream(h, [h], train=0)
        spamstream = msgs.SpamStream(s, [s], train=0)

        from_scratch = options.build_each_classifier_from_scratch

        if i > 0 and from_scratch:
            # Build a new classifier from the other sets.
            d.new_classifier()

            hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1)
            h2 = hamdirs[:i] + hamdirs[i+1:]

            sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1)
            s2 = spamdirs[:i] + spamdirs[i+1:]

            d.train(msgs.HamStream(hname, h2, train=1),
                    msgs.SpamStream(sname, s2, train=1))
        elif i > 0:
            # Forget this set.
            d.untrain(hamstream, spamstream)

        # Predict this set.
        d.test(hamstream, spamstream)
        d.finishtest()

        if i < nsets - 1 and not from_scratch:
            # Add this set back in.
            d.train(hamstream, spamstream)

    d.alldone()

def main():
    """Parse the command line, configure message selection, run drive().

    -n is required and positional arguments are rejected; both problems
    end in usage(1, ...).  When --ham-keep or --spam-keep (or both) is
    given, those limits are handed to msgs.setparms; otherwise the
    --HamTrain/--SpamTrain/--HamTest/--SpamTest limits are.
    """
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
                                   ['HamTrain=', 'SpamTrain=',
                                   'HamTest=', 'SpamTest=',
                                   'ham-keep=', 'spam-keep='])
    except getopt.error, msg:
        usage(1, msg)

    nsets = seed = hamtrain = spamtrain = None
    hamtest = spamtest = hamkeep = spamkeep = None
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-n':
            nsets = int(arg)
        elif opt == '-s':
            seed = int(arg)
        elif opt == '--HamTest':
            hamtest = int(arg)
        elif opt == '--SpamTest':
            spamtest = int(arg)
        elif opt == '--HamTrain':
            hamtrain = int(arg)
        elif opt == '--SpamTrain':
            spamtrain = int(arg)
        elif opt == '--ham-keep':
            hamkeep = int(arg)
        elif opt == '--spam-keep':
            spamkeep = int(arg)

    if args:
        usage(1, "Positional arguments not supported")
    if nsets is None:
        usage(1, "-n is required")

    # Either keep option selects the "keep" form.  Testing hamkeep alone
    # would silently drop a --spam-keep given without --ham-keep.
    if hamkeep is not None or spamkeep is not None:
        msgs.setparms(hamkeep, spamkeep, seed=seed)
    else:
        msgs.setparms(hamtrain, spamtrain, hamtest, spamtest, seed)
    drive(nsets)

if __name__ == "__main__":
    main()

--- NEW FILE: timtest.py ---
#! /usr/bin/env python

# A test driver using "the standard" test directory structure.  See also
# rates.py and cmp.py for summarizing results.  This runs an NxN test grid,
# skipping the diagonal.

"""Usage: %(program)s  [options] -n nsets

Where:
    -h
        Show usage and exit.
    -n int
        Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
        This is required.

If you only want to use some of the messages in each set,

    --ham-keep int
        The maximum number of msgs to use from each Ham set.  The msgs are
        chosen randomly.  See also the -s option.

    --spam-keep int
        The maximum number of msgs to use from each Spam set.  The msgs are
        chosen randomly.  See also the -s option.

    -s int
        A seed for the random number generator.  Has no effect unless
        at least one of {--ham-keep, --spam-keep} is specified.  If -s
        isn't specified, the seed is taken from current time.

In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""

from __future__ import generators

import sys

from spambayes.Options import options
from spambayes import TestDriver
from spambayes import msgs

program = sys.argv[0]

def usage(code, msg=''):
    """Print usage message and sys.exit(code).

    If msg is non-empty it is shown first, with a blank line after it.
    Everything is written to stderr.
    """
    if msg:
        sys.stderr.write("%s\n" % str(msg))
        sys.stderr.write("\n")
    # Module docstring, with %(program)s taken from the module globals.
    text = __doc__ % globals()
    sys.stderr.write(text + "\n")
    sys.exit(code)

def drive(nsets):
    """Run the NxN test grid over set directories 1 .. nsets.

    For every set a fresh classifier is trained on that set alone and
    then tested against each of the other sets (the diagonal is
    skipped); finishtest() is called once per trained classifier.
    """
    print(options.display())

    set_numbers = range(1, nsets+1)
    spamdirs = [options.spam_directories % number for number in set_numbers]
    hamdirs = [options.ham_directories % number for number in set_numbers]
    spamhamdirs = zip(spamdirs, hamdirs)

    d = TestDriver.Driver()
    for trained in spamhamdirs:
        spamdir, hamdir = trained
        d.new_classifier()
        d.train(msgs.HamStream(hamdir, [hamdir]),
                msgs.SpamStream(spamdir, [spamdir]))
        for other in spamhamdirs:
            if other != trained:
                sd2, hd2 = other
                d.test(msgs.HamStream(hd2, [hd2]),
                       msgs.SpamStream(sd2, [sd2]))
        d.finishtest()
    d.alldone()

def main():
    """Parse the command line, configure message selection, run drive().

    -n is required and positional arguments are rejected; both problems
    end in usage(1, ...).
    """
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
                                   ['ham-keep=', 'spam-keep='])
    except getopt.error, msg:
        usage(1, msg)

    # Every option except -h takes an integer; a repeated option keeps
    # its last value.
    chosen = {'-n': None, '-s': None, '--ham-keep': None, '--spam-keep': None}
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt in chosen:
            chosen[opt] = int(arg)

    nsets = chosen['-n']
    seed = chosen['-s']
    hamkeep = chosen['--ham-keep']
    spamkeep = chosen['--spam-keep']

    if args:
        usage(1, "Positional arguments not supported")
    if nsets is None:
        usage(1, "-n is required")

    msgs.setparms(hamkeep, spamkeep, seed=seed)
    drive(nsets)

if __name__ == "__main__":
    main()

--- NEW FILE: weaktest.py ---
#! /usr/bin/env python

# A test driver using "the standard" test directory structure.
# This simulates a user that gets E-mail, and only trains on fp,
# fn and unsure messages. It starts by training on the first 30
# messages, and from that point on well classified messages will
# not be used for training. This can be used to see what the performance
# of the scoring algorithm is under such conditions. Questions are:
#  * How does the size of the database behave over time?
#  * Does the classification get better over time?
#  * Are there other combinations of parameters for the classifier
#    that make this better behaved than the default values?


"""Usage: %(program)s  [options] -n nsets

Where:
    -h
        Show usage and exit.
    -n int
        Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
        This is required.
    -d decider 
        Name of the decider. One of %(decisionkeys)s
    -m min
        Minimal number of messages to train on before involving the decider.

In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""

from __future__ import generators

import sys,os

from spambayes.Options import options
from spambayes import hammie, msgs, CostCounter

# Script name; substituted into the usage text through %(program)s.
program = sys.argv[0]

# Verbosity level; main() raises it by one for every -v option.
debug = 0

def usage(code, msg=''):
    """Write the usage text to stderr, then sys.exit(code).

    A non-empty `msg` is written first and separated from the usage text
    by a blank line.  The usage text is the module docstring interpolated
    with the module globals.
    """
    err = sys.stderr
    if msg:
        err.write("%s\n\n" % (msg,))
    err.write("%s\n" % (__doc__ % globals(),))
    sys.exit(code)

# Results a decider can hand back to the driver: train the message as
# ham, as spam, or not at all.  DONT_TRAIN is None, which is also what a
# decider method yields when it falls off its end without a return.
DONT_TRAIN = None
TRAIN_AS_HAM = 1
TRAIN_AS_SPAM = 2

class TrainDecision:
    """Base class for deciders: callables that pick a training action.

    Subclasses supply spamtrain(scr) and hamtrain(scr).  Calling the
    instance with a score and the message's true class routes the score to
    the matching method and returns whatever that method returns.
    """

    def __call__(self, scr, is_spam):
        if not is_spam:
            return self.hamtrain(scr)
        return self.spamtrain(scr)

class UnsureAndFalses(TrainDecision):
    """Decider that trains on everything that was not classified correctly.

    A spam scoring below options.spam_cutoff and a ham scoring above
    options.ham_cutoff are trained on under their true class; any other
    message is not trained on.
    """

    def spamtrain(self, scr):
        """Return TRAIN_AS_SPAM for a spam that did not reach spam_cutoff."""
        if scr < options.spam_cutoff:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self, scr):
        """Return TRAIN_AS_HAM for a ham that scored above ham_cutoff."""
        if scr > options.ham_cutoff:
            return TRAIN_AS_HAM
        return DONT_TRAIN

class UnsureOnly(TrainDecision):
    """Decider that trains only on messages scored in the unsure range.

    The unsure range is the open interval between options.ham_cutoff and
    options.spam_cutoff.  Messages scoring there are trained on under their
    true class; all others, including outright misclassifications, are not
    trained on.
    """

    def spamtrain(self, scr):
        """Return TRAIN_AS_SPAM when the score is strictly between cutoffs."""
        if options.ham_cutoff < scr < options.spam_cutoff:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self, scr):
        """Return TRAIN_AS_HAM when the score is strictly between cutoffs."""
        if options.ham_cutoff < scr < options.spam_cutoff:
            return TRAIN_AS_HAM
        return DONT_TRAIN

class All(TrainDecision):
    """Decider that trains on every message, under its true class."""

    def hamtrain(self, scr):
        return TRAIN_AS_HAM

    def spamtrain(self, scr):
        return TRAIN_AS_SPAM

class AllBut0and100(TrainDecision):
    """Decider that trains on everything except extreme, correct scores.

    A spam is skipped only when it scores 0.995 or higher, a ham only when
    it scores 0.005 or lower; every other message is trained on under its
    true class.
    """

    def spamtrain(self, scr):
        """Return TRAIN_AS_SPAM unless the spam already scores >= 0.995."""
        if scr < 0.995:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self, scr):
        """Return TRAIN_AS_HAM unless the ham already scores <= 0.005."""
        if scr > 0.005:
            return TRAIN_AS_HAM
        return DONT_TRAIN

class OwnDecision(TrainDecision):
    """Decider that trains on the classifier's own verdict.

    The true class of the message is ignored: a score below
    options.ham_cutoff trains the message as ham, a score above
    options.spam_cutoff trains it as spam, and anything in between is not
    trained on.
    """

    def hamtrain(self, scr):
        """Map the score to a training action, whatever the true class."""
        if scr < options.ham_cutoff:
            return TRAIN_AS_HAM
        elif scr > options.spam_cutoff:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    # Spam goes through exactly the same score-based rule as ham.
    spamtrain = hamtrain

class OwnDecisionFNCorrection(OwnDecision):
    """OwnDecision variant in which real spam is always trained as spam.

    Ham is still handled by the inherited, score-based hamtrain().
    """

    def spamtrain(self, scr):
        return TRAIN_AS_SPAM

# Maps the names accepted by the -d option to the decider classes.
decisions={'all': All,
           'allbut0and100': AllBut0and100,
           'unsureonly': UnsureOnly,
           'unsureandfalses': UnsureAndFalses,
           'owndecision': OwnDecision,
           'owndecision+fn': OwnDecisionFNCorrection,
          }
# Sorted list of those names; interpolated into the usage text through
# %(decisionkeys)s.
decisionkeys=decisions.keys()
decisionkeys.sort()

class FirstN:
    """Decider wrapper that forces training on the first `n` messages.

    The first `n` calls return the training action matching the message's
    true class without consulting the wrapped decider; every later call is
    delegated to `client`.

    Attributes:
        client: the wrapped decider, called as client(scr, is_spam).
        n: number of initial messages that are always trained on.
        x: number of messages seen so far.
    """

    def __init__(self, n, client):
        self.client = client
        self.x = 0
        self.n = n

    def __call__(self, scr, is_spam):
        """Return the training action for a message scored `scr`."""
        # Test before counting this message: counting first would let the
        # n-th message through to the client, so that only n-1 messages
        # were force-trained.
        forced = self.tooearly()
        self.x += 1
        if not forced:
            return self.client(scr, is_spam)
        if is_spam:
            return TRAIN_AS_SPAM
        return TRAIN_AS_HAM

    def tooearly(self):
        """Return true while the next message is still force-trained."""
        return self.x < self.n

class Updater:
    """Minimal holder for a single object, kept in the attribute `d`."""

    def setd(self, d):
        """Replace the held object with `d`."""
        self.d = d

    def __init__(self, d=None):
        # Go through setd() so the initial value takes the same path as
        # any later replacement.
        self.setd(d)

def drive(nsets,decision):
    print options.display()

    spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
    hamdirs  = [options.ham_directories % i for i in range(1, nsets+1)]

    spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)]

    nham = len(hamfns)
    nspam = len(spamfns)
    cc = CostCounter.nodelay()

    allfns = {}
    for fn in spamfns+hamfns:
        allfns[fn] = None

    d = hammie.open('weaktest.db', False)

    hamtrain = 0
    spamtrain = 0
    n = 0
    for dir,name, is_spam in allfns.iterkeys():
        n += 1
        m=msgs.Msg(dir, name).guts
        if debug > 1:
            print "trained:%dH+%dS"%(hamtrain,spamtrain)
        scr=d.score(m)
        if debug > 1:
            print "score:%.3f"%scr
        if not decision.tooearly():
            if is_spam:
                if debug > 0:
                    print "Spam with score %.2f"%scr
                cc.spam(scr)
            else:
                if debug > 0:
                    print "Ham with score %.2f"%scr
                cc.ham(scr)
        de = decision(scr,is_spam) 
        if de == TRAIN_AS_SPAM: 
            d.train_spam(m)
            spamtrain += 1
        elif de == TRAIN_AS_HAM:
            d.train_ham(m)
            hamtrain += 1
        if n % 100 == 0:
            print "%5d trained:%dH+%dS wrds:%d"%(
                n, hamtrain, spamtrain, len(d.bayes.wordinfo))
            print cc
    print "="*70
    print "%5d trained:%dH+%dS wrds:%d"%(
        n, hamtrain, spamtrain, len(d.bayes.wordinfo))
    print cc

def main():
    """Parse the command line and run the weak-training simulation.

    Options: -h prints the usage text; -n gives the number of sets and is
    mandatory; -d picks a decider class by name (default 'unsureonly');
    -m sets the count handed to FirstN (default 10); every -v raises the
    module-level debug level by one.  Positional arguments are rejected.
    """
    global debug

    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'vd:hn:m:')
    except getopt.error, msg:
        usage(1, msg)

    nsets = None
    decider_class = decisions['unsureonly']
    min_train = 10

    for opt, arg in opts:
        if opt == '-v':
            debug += 1
        elif opt == '-h':
            usage(0)
        elif opt == '-d':
            if arg not in decisions:
                usage(1,'Unknown decisionmaker')
            decider_class = decisions[arg]
        elif opt == '-m':
            min_train = int(arg)
        elif opt == '-n':
            nsets = int(arg)

    if args:
        usage(1, "Positional arguments not supported")
    if nsets is None:
        usage(1, "-n is required")

    # Wrap the chosen decider so that the initial messages are always
    # trained on, whatever the decider itself would say.
    drive(nsets,decision=FirstN(min_train,decider_class()))

# Run the driver only when executed as a script, not when imported.
if __name__ == "__main__":
    main()





More information about the Spambayes-checkins mailing list