[Spambayes-checkins] spambayes weaktest.py,NONE,1.1

Sat Nov 9 21:48:55 2002

Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv31102

Added Files:
	weaktest.py 
Log Message:
New test driver to simulate "unsure only" training

--- NEW FILE: weaktest.py ---
#! /usr/bin/env python

# A test driver using "the standard" test directory structure.
# This simulates a user that gets E-mail, and only trains on fp,
# fn and unsure messages. It starts by training on the first 30
# messages, and from that point on well classified messages will
# not be used for training. This can be used to see what the performance
# of the scoring algorithm is under such conditions. Questions are:
#  * How does the size of the database behave over time?
#  * Does the classification get better over time?
#  * Are there other combinations of parameters for the classifier
#    that make this better behaved than the default values?

"""Usage: %(program)s  [options] -n nsets

Where:
    -h
        Show usage and exit.
    -n int
        Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
        This is required.

In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""

from __future__ import generators

import sys,os

from Options import options
import hammie

import msgs

program = sys.argv[0]

debug = 0

def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    if msg:
        print >> sys.stderr, msg
        print >> sys.stderr
    print >> sys.stderr, __doc__ % globals()
    sys.exit(code)

def drive(nsets):
    print options.display()

    spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
    hamdirs  = [options.ham_directories % i for i in range(1, nsets+1)]

    spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)]
    hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)]

    nham = len(hamfns)
    nspam = len(spamfns)

    allfns={}
    for fn in spamfns+hamfns:
        allfns[fn] = None

    d = hammie.Hammie(hammie.createbayes('weaktest.db', False))

    n=0
    unsure=0
    hamtrain=0
    spamtrain=0
    fp=0
    fn=0
    for dir,name, is_spam in allfns.iterkeys():
        n += 1
        m=msgs.Msg(dir, name).guts
        if debug:
            print "trained:%dH+%dS fp:%d fn:%d unsure:%d before %s/%s"%(hamtrain,spamtrain,fp,fn,unsure,dir,name),
        if hamtrain + spamtrain > 30:
            scr=d.score(m)
        else:
            scr=0.50
        if debug:
            print "score:%.3f"%scr,
        if scr < hammie.SPAM_THRESHOLD and is_spam:
            if scr < hammie.HAM_THRESHOLD:
                fn += 1
                if debug:
                    print "fn"
            else:
                unsure += 1
                if debug:
                    print "Unsure"
            spamtrain += 1
            d.train_spam(m)
            d.update_probabilities()
        elif scr > hammie.HAM_THRESHOLD and not is_spam:
            if scr > hammie.SPAM_THRESHOLD:
                fp += 1
                if debug:
                    print "fp"
                else:
                    print "fp: %s score:%.4f"%(os.path.join(dir,name),scr)
            else:
                unsure += 1
                if debug:
                    print "Unsure"
            hamtrain += 1
            d.train_ham(m)
            d.update_probabilities()
        else:
            if debug:
                print "OK"
        if n % 100 == 0:
            print "%5d trained:%dH+%dS wrds:%d fp:%d fn:%d unsure:%d"%(
                n,hamtrain,spamtrain,len(d.bayes.wordinfo),fp,fn,unsure)
    print "Total messages %d (%d ham and %d spam)"%(len(allfns),nham,nspam)
    print "Total unsure (including 30 startup messages): %d (%.1f%%)"%(
        unsure,unsure*100.0/len(allfns))
    print "Trained on %d ham and %d spam"%(hamtrain,spamtrain)
    print "fp: %d fn: %d"%(fp,fn)
    FPW = options.best_cutoff_fp_weight
    FNW = options.best_cutoff_fn_weight
    UNW = options.best_cutoff_unsure_weight
    print "Total cost: $%.2f"%(FPW*fp+FNW*fn+UNW*unsure)

def main():
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
                                   ['ham-keep=', 'spam-keep='])
    except getopt.error, msg:
        usage(1, msg)

    nsets = seed = hamkeep = spamkeep = None
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-n':
            nsets = int(arg)

    if args:
        usage(1, "Positional arguments not supported")
    if nsets is None:
        usage(1, "-n is required")

    drive(nsets)

if __name__ == "__main__":
    main()