[Spambayes-checkins]
spambayes/testtools cmp.py,NONE,1.1.2.1 fpfn.py,NONE,1.1.2.1
mboxtest.py,NONE,1.1.2.1 rates.py,NONE,1.1.2.1
simplexloop.py,NONE,1.1.2.1 table.py,NONE,1.1.2.1
timcv.py,NONE,1.1.2.1 timtest.py,NONE,1.1.2.1 weaktest.py,NONE,1.1.2.1
Anthony Baxter
anthonybaxter at users.sourceforge.net
Fri Jan 10 02:41:10 EST 2003
- Previous message: [Spambayes-checkins] spambayes/spambayes Corpus.py,NONE,1.1.2.1
CostCounter.py,NONE,1.1.2.1 FileCorpus.py,NONE,1.1.2.1
Histogram.py,NONE,1.1.2.1 Options.py,NONE,1.1.2.1
TestDriver.py,NONE,1.1.2.1 Tester.py,NONE,1.1.2.1
__init__.py,NONE,1.1.2.1 cdb.py,NONE,1.1.2.1 chi2.py,NONE,1.1.2.1
classifier.py,NONE,1.1.2.1 dbmstorage.py,NONE,1.1.2.1
hammie.py,NONE,1.1.2.1 hammiebulk.py,NONE,1.1.2.1
mboxutils.py,NONE,1.1.2.1 msgs.py,NONE,1.1.2.1
optimize.py,NONE,1.1.2.1 storage.py,NONE,1.1.2.1 tokenizer.py,NONE,1.1.2.1
- Next message: [Spambayes-checkins] spambayes/utilities HistToGNU.py,NONE,1.1.2.1
loosecksum.py,NONE,1.1.2.1 mboxcount.py,NONE,1.1.2.1
rebal.py,NONE,1.1.2.1 split.py,NONE,1.1.2.1 splitn.py,NONE,1.1.2.1
splitndirs.py,NONE,1.1.2.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes/testtools
In directory sc8-pr-cvs1:/tmp/cvs-serv9389/testtools
Added Files:
Tag: reorg-branch
cmp.py fpfn.py mboxtest.py rates.py simplexloop.py table.py
timcv.py timtest.py weaktest.py
Log Message:
Checkpointing before I head home.
Still to do:
- distutils magic to make sure that the 22compat modules are
installed when needed.
- Walking through testtools and utilities and fixing imports.
- Documentation.
hammie works, everything else that people use in day-to-day operation
should work - please give it a go.
--- NEW FILE: cmp.py ---
#!/usr/bin/env python
"""
cmp.py sbase1 sbase2
Combines output from sbase1.txt and sbase2.txt, which are created by
rates.py from timtest.py output, and displays comparison statistics to
stdout.
"""
import sys
# The first two command-line arguments name the two summary files to
# compare; they are printed and resolved to real paths further down.
f1n, f2n = sys.argv[1:3]
# Return
# (list of all f-p rates,
# list of all f-n rates,
# total f-p,
# total f-n,
# average f-p rate,
# average f-n rate,
# list of all ham score deviations,
# list of all spam score deviations,
# ham score deviation for all runs,
# spam score deviations for all runs,
# )
# from summary file f.
def suck(f):
    """Parse one summary file and return its statistics.

    f is an open file-like object; only its readline() method is used.
    Every '-> <stat> tested' line is echoed to stdout as it is read.

    Returns the 10-tuple
        (fps, fns, fptot, fntot, fpmean, fnmean,
         hamdev, spamdev, hamdevall, spamdevall)
    where fps/fns are lists of per-run rates, fptot/fntot are ints,
    fpmean/fnmean are floats, hamdev/spamdev are lists of (mean, sdev)
    pairs and hamdevall/spamdevall are single (mean, sdev) pairs that
    stay (0.0, 0.0) when the file has no "for all runs" line.

    Raises ValueError if the input ends before a line starting with
    'total' is seen.
    """
    fns = []
    fps = []
    hamdev = []
    spamdev = []
    hamdevall = spamdevall = (0.0, 0.0)

    get = f.readline
    while 1:
        line = get()
        if not line:
            # readline() returns '' only at end of file.  Without this
            # check a truncated file fails below with an opaque
            # tuple-unpacking error.
            raise ValueError("summary file ended before its 'total' lines")
        if line.startswith('-> <stat> tested'):
            # The line still carries its newline, so write it verbatim.
            sys.stdout.write(line)
        if line.find(' items; mean ') != -1:
            # -> <stat> Ham distribution for this pair: 1000 items; mean 0.05; sample sdev 0.68
            # and later "sample " went away
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            # Third word of the first field: 'Ham' or anything else (spam).
            typ = vals[0].split()[2]
            if line.find('for all runs') != -1:
                if typ == 'Ham':
                    hamdevall = val
                else:
                    spamdevall = val
            elif line.find('all in this') != -1:
                if typ == 'Ham':
                    hamdev.append(val)
                else:
                    spamdev.append(val)
            continue
        if line.startswith('-> '):
            continue
        if line.startswith('total'):
            break
        # A line with an f-p rate and an f-n rate.
        p, n = map(float, line.split())
        fps.append(p)
        fns.append(n)

    # The loop stopped on the first of four trailer lines:
    # "total unique false pos 0"
    # "total unique false neg 0"
    # "average fp % 0.0"
    # "average fn % 0.0"
    fptot = int(line.split()[-1])
    fntot = int(get().split()[-1])
    fpmean = float(get().split()[-1])
    fnmean = float(get().split()[-1])
    return (fps, fns, fptot, fntot, fpmean, fnmean,
            hamdev, spamdev, hamdevall, spamdevall)
def tag(p1, p2):
    """Return a won/tied/lost label for the change from p1 to p2.

    The label is "tied " when the values are equal, "lost " when p2 is
    larger and "won " otherwise.  It is followed by the signed percentage
    change relative to p1, or by " +(was 0)" when p1 is zero.
    """
    if p1 == p2:
        verdict = "tied "
    elif p1 < p2:
        verdict = "lost "
    else:
        verdict = "won "

    if not p1:
        # No meaningful percentage relative to zero.
        return verdict + " +(was 0)"
    change = (p2 - p1) * 100.0 / p1
    return verdict + " %+7.2f%%" % change
def mtag(m1, m2):
    """Format the change between two (mean, sdev) pairs as one line.

    The result shows both means and their percentage change, then both
    sdevs and their percentage change.  A zero starting value is shown
    as "+(was 0)" instead of a percentage.
    """
    mean1, dev1 = m1
    mean2, dev2 = m2

    def change(old, new):
        # Signed percentage change, or a marker when the base is zero.
        if old:
            return "%+7.2f%%" % ((new - old) * 100.0 / old)
        return "+(was 0)"

    return ("%7.2f %7.2f " % (mean1, mean2) + change(mean1, mean2) +
            " %7.2f %7.2f " % (dev1, dev2) + change(dev1, dev2))
def dump(p1s, p2s):
alltags = ""
for p1, p2 in zip(p1s, p2s):
t = tag(p1, p2)
print " %5.3f %5.3f %s" % (p1, p2, t)
alltags += t + " "
print
for t in "won", "tied", "lost":
print "%-4s %2d times" % (t, alltags.count(t))
print
def dumpdev(meandev1, meandev2):
for m1, m2 in zip(meandev1, meandev2):
print mtag(m1, m2)
def windowsfy(fn):
    """Return fn + '.txt' if that path exists, otherwise fn unchanged."""
    import os
    candidate = fn + '.txt'
    if not os.path.exists(candidate):
        return fn
    return candidate
# Script body: announce the comparison, load both summary files, then
# print the false-positive, false-negative and score-distribution reports.
print f1n, '->', f2n

# Accept either "base" or "base.txt" on the command line.
f1n = windowsfy(f1n)
f2n = windowsfy(f2n)

(fp1, fn1, fptot1, fntot1, fpmean1, fnmean1,
 hamdev1, spamdev1, hamdevall1, spamdevall1) = suck(file(f1n))

(fp2, fn2, fptot2, fntot2, fpmean2, fnmean2,
 hamdev2, spamdev2, hamdevall2, spamdevall2) = suck(file(f2n))

print
print "false positive percentages"
dump(fp1, fp2)
print "total unique fp went from", fptot1, "to", fptot2, tag(fptot1, fptot2)
print "mean fp % went from", fpmean1, "to", fpmean2, tag(fpmean1, fpmean2)

print
print "false negative percentages"
dump(fn1, fn2)
print "total unique fn went from", fntot1, "to", fntot2, tag(fntot1, fntot2)
print "mean fn % went from", fnmean1, "to", fnmean2, tag(fnmean1, fnmean2)

print
# The per-run deviation lists are only compared when both files hold the
# same number of entries; otherwise a notice is printed instead.
if len(hamdev1) == len(hamdev2) and len(spamdev1) == len(spamdev2):
    print "ham mean ham sdev"
    dumpdev(hamdev1, hamdev2)
    print
    print "ham mean and sdev for all runs"
    dumpdev([hamdevall1], [hamdevall2])

    print
    print "spam mean spam sdev"
    dumpdev(spamdev1, spamdev2)
    print
    print "spam mean and sdev for all runs"
    dumpdev([spamdevall1], [spamdevall2])

    print
    # Distance between the spam and ham means in each file, and its change.
    diff1 = spamdevall1[0] - hamdevall1[0]
    diff2 = spamdevall2[0] - hamdevall2[0]
    print "ham/spam mean difference: %2.2f %2.2f %+2.2f" % (diff1,
                                                           diff2,
                                                           diff2 - diff1)
else:
    print "[info about ham & spam means & sdevs not available in both files]"
--- NEW FILE: fpfn.py ---
#! /usr/bin/env python
"""Extract false positive and false negative filenames from timcv.py output."""
import sys
import re
def cmpf(a, b):
    """Comparison function ordering names by their trailing 'N/M' numbers.

    When both strings end in <digits>/<digits>, the two numbers are
    compared as integers; otherwise the strings themselves are compared.
    Returns -1, 0 or 1.
    """
    ma = re.search(r'(\d+)/(\d+)$', a)
    mb = re.search(r'(\d+)/(\d+)$', b)
    if ma and mb:
        # Compare (first number, second number) numerically.
        a = tuple(map(int, ma.groups()))
        b = tuple(map(int, mb.groups()))
    # Three-way comparison spelled out with the comparison operators.
    return (a > b) - (a < b)
def main():
for name in sys.argv[1:]:
try:
f = open(name + ".txt")
except IOError:
f = open(name)
print "===", name, "==="
fp = []
fn = []
for line in f:
if line.startswith(' new fp: '):
fp.extend(eval(line[12:]))
elif line.startswith(' new fn: '):
fn.extend(eval(line[12:]))
fp.sort(cmpf)
fn.sort(cmpf)
print "--- fp ---"
for x in fp:
print x
print "--- fn ---"
for x in fn:
print x
if __name__ == '__main__':
main()
--- NEW FILE: mboxtest.py ---
#! /usr/bin/env python
"""mboxtest.py: A test driver for classifier.
Usage: mboxtest.py [options] <ham> <spam>
Options:
-f FMT
One of unix, mmdf, mh, or qmail. Specifies mailbox format for
ham and spam files. Default is unix.
-n NSETS
Number of test sets to create for a single mailbox. Default is 10.
-s SEED
Seed for random number generator. Default is 101.
-m MSGS
Read no more than MSGS messages from mailbox.
"""
from __future__ import generators
import getopt
import mailbox
import random
import re
from sets import Set
import sys
from spambayes.tokenizer import tokenize
from spambayes.TestDriver import Driver
from spambayes.msgs import Msg
from spambayes.Options import options
try:
    True, False
except NameError:
    # Maintain compatibility with Python 2.2
    True, False = 1, 0

# Maps the value accepted by the -f option to the mailbox class used to
# read the ham and spam files.
mbox_fmts = {"unix": mailbox.PortableUnixMailbox,
             "mmdf": mailbox.MmdfMailbox,
             "mh": mailbox.MHMailbox,
             "qmail": mailbox.Maildir,
             }
class MboxMsg(Msg):
    """A message read from a mailbox file object.

    guts holds the full text read from fp; tag is "path:index subject".
    """

    def __init__(self, fp, path, index):
        self.guts = fp.read()
        self.tag = "%s:%s %s" % (path, index, subject(self.guts))

    def __str__(self):
        """Return an abbreviated rendering of the message.

        Lines starting with 'X-', 'Received:' or a tab are dropped, and
        after 100 kept lines the text ends with "... truncated".
        """
        kept = []
        for line in self.guts.split("\n"):
            noisy = False
            for skip_prefix in ('X-', 'Received:', '\t'):
                if line.startswith(skip_prefix):
                    noisy = True
                    break
            if noisy:
                continue
            if len(kept) >= 100:
                kept.append("... truncated")
                break
            kept.append(line)
        return "\n".join(kept)

    def __iter__(self):
        """Iterate over the result of tokenize() applied to the text."""
        return tokenize(self.guts)
class mbox(object):
    """An iterable over some or all of the messages in one mailbox file.

    path is the mailbox file name.  indices, when given, is a sequence of
    0-based message positions to yield; when it is None every message is
    yielded.  An empty sequence selects no messages.
    """

    def __init__(self, path, indices=None):
        self.path = path
        self.indices = {}
        self.key = ''
        # Remember whether a selection was requested at all, so that an
        # empty selection is not mistaken for "no selection".
        self.selective = indices is not None
        if indices:
            # The first index only labels this subset in __repr__.
            self.key = " %s" % indices[0]
            for i in indices:
                self.indices[i] = 1

    def __repr__(self):
        return "<mbox: %s%s>" % (self.path, self.key)

    def __iter__(self):
        # The factory reads the current value of i when the mailbox
        # object builds each message, i.e. before i is incremented, so
        # messages are numbered from 0.
        # NOTE(review): every format in mbox_fmts is handed an open file
        # here; confirm that this suits the "mh" and "qmail" classes.
        fp = open(self.path, "rb")
        box = mbox_fmts[FMT](fp, lambda f: MboxMsg(f, self.path, i))

        i = 0
        while 1:
            msg = box.next()
            if msg is None:
                # MboxMsg reads its text on construction, so the file is
                # no longer needed once the mailbox is exhausted.
                fp.close()
                return
            i += 1
            if not self.selective or self.indices.get(i-1):
                yield msg
def subject(buf):
    """Return the lower-cased first 'subject:' line found in buf.

    The result runs from the first occurrence of 'subject:' up to, but
    not including, the next newline, or to the end of buf when no
    newline follows.  Returns '' when buf contains no 'subject:'.
    """
    buf = buf.lower()
    start = buf.find('subject:')
    if start < 0:
        return ''
    end = buf.find("\n", start)
    if end < 0:
        # Subject is on the last line with no trailing newline; slicing
        # with -1 here would drop its final character.
        end = len(buf)
    return buf[start:end]
def randindices(nelts, nresults):
    """Yield nresults disjoint random Sets of indices below nelts.

    The indices 0..nelts-1 are shuffled with the random module and cut
    into consecutive chunks of nelts // nresults elements each; leftover
    indices are not used.
    """
    pool = list(range(nelts))
    random.shuffle(pool)
    size = nelts // nresults
    for k in range(nresults):
        yield Set(pool[k * size:(k + 1) * size])
def sort(seq):
    """Return a new sorted list of the items in seq; seq is not modified."""
    ordered = []
    ordered.extend(seq)
    ordered.sort()
    return ordered
def main(args):
global FMT
print options.display()
FMT = "unix"
NSETS = 10
SEED = 101
MAXMSGS = None
opts, args = getopt.getopt(args, "f:n:s:m:")
for k, v in opts:
if k == '-f':
FMT = v
if k == '-n':
NSETS = int(v)
if k == '-s':
SEED = int(v)
if k == '-m':
MAXMSGS = int(v)
ham, spam = args
random.seed(SEED)
nham = len(list(mbox(ham)))
nspam = len(list(mbox(spam)))
if MAXMSGS:
nham = min(nham, MAXMSGS)
nspam = min(nspam, MAXMSGS)
print "ham", ham, nham
print "spam", spam, nspam
ihams = map(tuple, randindices(nham, NSETS))
ispams = map(tuple, randindices(nspam, NSETS))
driver = Driver()
for i in range(1, NSETS):
driver.train(mbox(ham, ihams[i]), mbox(spam, ispams[i]))
i = 0
for iham, ispam in zip(ihams, ispams):
hams = mbox(ham, iham)
spams = mbox(spam, ispam)
if i > 0:
driver.untrain(hams, spams)
driver.test(hams, spams)
driver.finishtest()
if i < NSETS - 1:
driver.train(hams, spams)
i += 1
driver.alldone()
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))
--- NEW FILE: rates.py ---
#!/usr/bin/env python
"""
rates.py basename ...
Assuming that file
basename + '.txt'
or
basename
contains output from one of the test drivers (timcv, mboxtest, timtest),
scans that file for summary statistics, displays them to stdout, and also
writes them to file
basename + 's.txt'
(where the 's' means 'summary'). This doesn't need a full output file
from a test run, and will display stuff for as far as the output file
has gotten so far.
Two of these summary files can later be fed to cmp.py.
"""
import sys
"""
-> Training on Data/Ham/Set2-3 & Data/Spam/Set2-3 ... 8000 hams & 5500 spams
-> Predicting Data/Ham/Set1 & Data/Spam/Set1 ...
-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams
-> <stat> false positive %: 0.025
-> <stat> false negative %: 0.327272727273
-> <stat> 1 new false positives
"""
def doit(basename):
    """Summarize one test-driver output file.

    basename may be given with or without a trailing '.txt'.  The input
    is basename + '.txt' if that can be opened, otherwise basename.
    Every input line starting with '-> ' is copied to stdout and to the
    file basename + 's.txt', followed by the totals and averages.

    When no '-> <stat> tested' line has been seen yet, the averages are
    reported as 0.0 instead of dividing by zero.
    """
    if basename.endswith('.txt'):
        basename = basename[:-4]

    try:
        ifile = open(basename + '.txt')
    except IOError:
        ifile = open(basename)
    try:
        interesting = [line for line in ifile if line.startswith('-> ')]
    finally:
        ifile.close()

    oname = basename + 's.txt'
    ofile = open(oname, 'w')
    try:
        print(' '.join((basename, '->', oname)))

        def dump(*stuff):
            # Send the same line to stdout and to the summary file.
            msg = ' '.join(map(str, stuff))
            print(msg)
            ofile.write(msg + '\n')

        ntests = nfn = nfp = 0
        sumfnrate = sumfprate = 0.0

        for line in interesting:
            # Drop the newline only if there is one; the last line of a
            # file that is still being written may not have it yet.
            if line[-1:] == '\n':
                dump(line[:-1])
            else:
                dump(line)
            fields = line.split()

            # 0      1      2    3    4 5    6                 -5  -4 -3   -2    -1
            #-> <stat> tested 4000 hams & 2750 spams against 8000 hams & 5500 spams
            if line.startswith('-> <stat> tested '):
                ntests += 1
                continue

            #  0      1     2        3
            # -> <stat> false positive %: 0.025
            # -> <stat> false negative %: 0.327272727273
            if line.startswith('-> <stat> false '):
                kind = fields[3]
                percent = float(fields[-1])
                if kind == 'positive':
                    sumfprate += percent
                    # Held until the matching false negative line.
                    lastval = percent
                else:
                    sumfnrate += percent
                    dump(' %7.3f %7.3f' % (lastval, percent))
                continue

            #  0      1 2   3     4         5
            # -> <stat> 1 new false positives
            if len(fields) >= 5 and fields[3] == 'new' and fields[4] == 'false':
                kind = fields[-1]
                count = int(fields[2])
                if kind == 'positives':
                    nfp += count
                else:
                    nfn += count

        if ntests:
            fpavg = sumfprate / ntests
            fnavg = sumfnrate / ntests
        else:
            fpavg = fnavg = 0.0

        dump('total unique false pos', nfp)
        dump('total unique false neg', nfn)
        dump('average fp %', fpavg)
        dump('average fn %', fnavg)
    finally:
        ofile.close()
# Summarize every file named on the command line.  The guard keeps this
# from running against the importer's sys.argv when the module is
# imported rather than executed.
if __name__ == "__main__":
    for name in sys.argv[1:]:
        doit(name)
--- NEW FILE: simplexloop.py ---
#
# Optimize parameters
#
"""Usage: %(program)s [options] -c command
Where:
-h
Show usage and exit.
-c command
The command to be run, with all its options.
The last line of output from this program should
match 'YYYYYYY cost: $xxxx.xx'
(i.e. the third word of the last line should be the value to be
minimized, preceded by a dollar sign)
I have used
"python2.3 timcv.py -n 10 --spam-keep=600 --ham-keep=600 -s 12345"
This program will overwrite bayescustomize.ini!
"""
import sys
def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    err = sys.stderr
    if msg:
        # The message, then a blank line to separate it from the usage text.
        print >> err, msg
        print >> err
    print >> err, __doc__ % globals()
    sys.exit(code)
# Substituted for %(program)s in the usage text.
program = sys.argv[0]

from spambayes import Options

# The three option values as currently configured; main() passes this
# tuple as the first argument to SimplexMaximize.
start = (Options.options.unknown_word_prob,
         Options.options.minimum_prob_strength,
         Options.options.unknown_word_strength)
# Passed as the second argument to SimplexMaximize, one entry per
# parameter in start.
err = (0.01, 0.01, 0.01)
def mkini(vars):
    """Write bayescustomize.ini in the current directory.

    vars must hold exactly three numbers, used in order for
    unknown_word_prob, minimum_prob_strength and unknown_word_strength.
    The text is formatted before the file is opened, so a bad vars
    raises without touching an existing bayescustomize.ini.
    """
    text = """
[Classifier]
unknown_word_prob = %.6f
minimum_prob_strength = %.6f
unknown_word_strength = %.6f
""" % tuple(vars)
    f = open('bayescustomize.ini', 'w')
    try:
        f.write(text)
    finally:
        f.close()
def score(vars):
import os
mkini(vars)
status = os.system('%s > loop.out'%command)
if status != 0:
print >> sys.stderr, "Error status from subcommand"
sys.exit(status)
f = open('loop.out', 'r')
txt = f.readlines()
# Extract the flex cost field.
cost = float(txt[-1].split()[2][1:])
f.close()
os.rename('loop.out','loop.out.old')
print ''.join(txt[-20:])[:-1]
print "x=%.4f p=%.4f s=%.4f %.2f"%(tuple(vars)+(cost,))
sys.stdout.flush()
return -cost
def main():
    """Run SimplexMaximize on score() and write the result to the ini file."""
    import spambayes.optimize
    maximize = spambayes.optimize.SimplexMaximize
    best = maximize(start, err, score)
    mkini(best)
    print("Best result left in bayescustomize.ini")
if __name__ == "__main__":
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hc:')
    except getopt.error, msg:
        usage(1, msg)

    # command is a module-level name here; score() reads it when it
    # builds the shell command line.
    command = None
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-c':
            command = arg

    if args:
        usage(1, "Positional arguments not supported")
    if command is None:
        usage(1, "-c is required")

    main()
--- NEW FILE: table.py ---
#!/usr/bin/env python
"""
table.py [-m] base1 base2 ... baseN
Combines output from base1.txt, base2.txt, etc., which are created by
the TestDriver (such as timcv.py) output, and displays tabulated
comparison statistics to stdout. Each input file is represented by
one column in the table.
Optional argument -m shows a final column with the mean value of each
statistic.
"""
# Return
# (
# ham tested,
# spam tested,
# total f-p,
# total f-n,
# total unsure,
# average f-p rate,
# average f-n rate,
# average unsure rate,
# real cost,
# best cost,
# ham score deviation for all runs,
# spam score deviations for all runs,
# )
# from summary file f.
def suck(f):
    """Parse one test-driver output file and return its overall statistics.

    f is an open file-like object; only its readline() method is used.
    Every '-> <stat> tested' line is echoed to stdout.  Reading stops at
    the '-> <stat> all runs cost: ' line, or at end of file if that line
    never appears.

    Returns the 12-tuple
        (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
         hamdevall, spamdevall)
    Any value whose line was not found keeps its zero default.
    """
    import sys

    hamdevall = spamdevall = (0.0, 0.0)
    cost = 0.0
    bestcost = 0.0
    fp = 0
    fn = 0
    un = 0
    fpp = 0.0
    fnp = 0.0
    unp = 0.0
    htest = 0
    stest = 0

    get = f.readline
    while 1:
        line = get()
        if not line:
            # End of file: readline() keeps returning '' from here on,
            # so without this check an incomplete file loops forever.
            break
        if line.startswith('-> <stat> tested'):
            # <stat> tested 1910 hams & 948 spams against 2741 hams & 948 spams
            #      1      2    3    4 5   6
            # The line still carries its newline, so write it verbatim.
            sys.stdout.write(line)
        elif line.find(' items; mean ') > 0 and line.find('for all runs') > 0:
            # <stat> Ham scores for all runs: 2741 items; mean 0.86; sdev 6.28
            #                                          0          1          2
            vals = line.split(';')
            mean = float(vals[1].split()[-1])
            sdev = float(vals[2].split()[-1])
            val = (mean, sdev)
            ntested = int(vals[0].split()[-2])
            # Third word of the first field: 'Ham' or anything else (spam).
            typ = vals[0].split()[2]
            if typ == 'Ham':
                hamdevall = val
                htest = ntested
            else:
                spamdevall = val
                stest = ntested
        elif line.startswith('-> best cost for all runs: $'):
            # -> best cost for all runs: $28.20
            bestcost = float(line.split('$')[-1])
        elif line.startswith('-> <stat> all runs false positives: '):
            fp = int(line.split()[-1])
        elif line.startswith('-> <stat> all runs false negatives: '):
            fn = int(line.split()[-1])
        elif line.startswith('-> <stat> all runs unsure: '):
            un = int(line.split()[-1])
        elif line.startswith('-> <stat> all runs false positive %: '):
            fpp = float(line.split()[-1])
        elif line.startswith('-> <stat> all runs false negative %: '):
            fnp = float(line.split()[-1])
        elif line.startswith('-> <stat> all runs unsure %: '):
            unp = float(line.split()[-1])
        elif line.startswith('-> <stat> all runs cost: '):
            # This is the last line of interest.
            cost = float(line.split('$')[-1])
            break

    return (htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
            hamdevall, spamdevall)
def windowsfy(fn):
    """Prefer fn + '.txt' when such a path exists; else return fn as given."""
    import os
    with_ext = fn + '.txt'
    if os.path.exists(with_ext):
        fn = with_ext
    return fn
def table():
import getopt, sys
showMean = 0
fname = "filename: "
fnam2 = " "
ratio = "ham:spam: "
rat2 = " "
fptot = "fp total: "
fpper = "fp %: "
fntot = "fn total: "
fnper = "fn %: "
untot = "unsure t: "
unper = "unsure %: "
rcost = "real cost:"
bcost = "best cost:"
hmean = "h mean: "
hsdev = "h sdev: "
smean = "s mean: "
ssdev = "s sdev: "
meand = "mean diff:"
kval = "k: "
tfptot = tfpper = tfntot = tfnper = tuntot = tunper = trcost = tbcost = \
thmean = thsdev = tsmean = tssdev = tmeand = tkval = 0
args, fileargs = getopt.getopt(sys.argv[1:], 'm')
for arg, val in args:
if arg == "-m":
showMean = 1
for filename in fileargs:
filename = windowsfy(filename)
(htest, stest, fp, fn, un, fpp, fnp, unp, cost, bestcost,
hamdevall, spamdevall) = suck(file(filename))
if filename.endswith('.txt'):
filename = filename[:-4]
filename = filename[filename.rfind('/')+1:]
filename = filename[filename.rfind("\\")+1:]
if len(fname) > len(fnam2):
fname += " "
fname = fname[0:(len(fnam2) + 8)]
fnam2 += " %7s" % filename
else:
fnam2 += " "
fnam2 = fnam2[0:(len(fname) + 8)]
fname += " %7s" % filename
if len(ratio) > len(rat2):
ratio += " "
ratio = ratio[0:(len(rat2) + 8)]
rat2 += " %7s" % ("%d:%d" % (htest, stest))
else:
rat2 += " "
rat2 = rat2[0:(len(ratio) + 8)]
ratio += " %7s" % ("%d:%d" % (htest, stest))
fptot += "%8d" % fp
tfptot += fp
fpper += "%8.2f" % fpp
tfpper += fpp
fntot += "%8d" % fn
tfntot += fn
fnper += "%8.2f" % fnp
tfnper += fnp
untot += "%8d" % un
tuntot += un
unper += "%8.2f" % unp
tunper += unp
rcost += "%8s" % ("$%.2f" % cost)
trcost += cost
bcost += "%8s" % ("$%.2f" % bestcost)
tbcost += bestcost
hmean += "%8.2f" % hamdevall[0]
thmean += hamdevall[0]
hsdev += "%8.2f" % hamdevall[1]
thsdev += hamdevall[1]
smean += "%8.2f" % spamdevall[0]
tsmean += spamdevall[0]
ssdev += "%8.2f" % spamdevall[1]
tssdev += spamdevall[1]
meand += "%8.2f" % (spamdevall[0] - hamdevall[0])
tmeand += (spamdevall[0] - hamdevall[0])
k = (spamdevall[0] - hamdevall[0]) / (spamdevall[1] + hamdevall[1])
kval += "%8.2f" % k
tkval += k
nfiles = len(fileargs)
if nfiles and showMean:
fptot += "%12d" % (tfptot/nfiles)
fpper += "%12.2f" % (tfpper/nfiles)
fntot += "%12d" % (tfntot/nfiles)
fnper += "%12.2f" % (tfnper/nfiles)
untot += "%12d" % (tuntot/nfiles)
unper += "%12.2f" % (tunper/nfiles)
rcost += "%12s" % ("$%.2f" % (trcost/nfiles))
bcost += "%12s" % ("$%.2f" % (tbcost/nfiles))
hmean += "%12.2f" % (thmean/nfiles)
hsdev += "%12.2f" % (thsdev/nfiles)
smean += "%12.2f" % (tsmean/nfiles)
ssdev += "%12.2f" % (tssdev/nfiles)
meand += "%12.2f" % (tmeand/nfiles)
kval += "%12.2f" % (tkval/nfiles)
print fname
if len(fnam2.strip()) > 0:
print fnam2
print ratio
if len(rat2.strip()) > 0:
print rat2
print fptot
print fpper
print fntot
print fnper
print untot
print unper
print rcost
print bcost
print hmean
print hsdev
print smean
print ssdev
print meand
print kval
if __name__ == "__main__":
table()
--- NEW FILE: timcv.py ---
#! /usr/bin/env python
# A driver for N-fold cross validation.
"""Usage: %(program)s [options] -n nsets
Where:
-h
Show usage and exit.
-n int
Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
This is required.
If you only want to use some of the messages in each set,
--HamTrain int
The maximum number of msgs to use from each Ham set for training.
The msgs are chosen randomly. See also the -s option.
--SpamTrain int
The maximum number of msgs to use from each Spam set for training.
The msgs are chosen randomly. See also the -s option.
--HamTest int
The maximum number of msgs to use from each Ham set for testing.
The msgs are chosen randomly. See also the -s option.
--SpamTest int
The maximum number of msgs to use from each Spam set for testing.
The msgs are chosen randomly. See also the -s option.
--ham-keep int
The maximum number of msgs to use from each Ham set for testing
and training. The msgs are chosen randomly. See also the -s option.
--spam-keep int
The maximum number of msgs to use from each Spam set for testing
and training. The msgs are chosen randomly. See also the -s option.
-s int
A seed for the random number generator. Has no effect unless
at least one of {--ham-keep, --spam-keep} is specified. If -s
isn't specified, the seed is taken from current time.
In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""
from __future__ import generators
import sys
from spambayes.Options import options
from spambayes import TestDriver
from spambayes import msgs
program = sys.argv[0]
def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    write = sys.stderr.write
    if msg:
        # The message, then a blank separator line.
        write(str(msg) + "\n")
        write("\n")
    write(__doc__ % globals() + "\n")
    sys.exit(code)
def drive(nsets):
print options.display()
hamdirs = [options.ham_directories % i for i in range(1, nsets+1)]
spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
d = TestDriver.Driver()
# Train it on all sets except the first.
d.train(msgs.HamStream("%s-%d" % (hamdirs[1], nsets),
hamdirs[1:], train=1),
msgs.SpamStream("%s-%d" % (spamdirs[1], nsets),
spamdirs[1:], train=1))
# Now run nsets times, predicting pair i against all except pair i.
for i in range(nsets):
h = hamdirs[i]
s = spamdirs[i]
hamstream = msgs.HamStream(h, [h], train=0)
spamstream = msgs.SpamStream(s, [s], train=0)
if i > 0:
if options.build_each_classifier_from_scratch:
# Build a new classifier from the other sets.
d.new_classifier()
hname = "%s-%d, except %d" % (hamdirs[0], nsets, i+1)
h2 = hamdirs[:]
del h2[i]
sname = "%s-%d, except %d" % (spamdirs[0], nsets, i+1)
s2 = spamdirs[:]
del s2[i]
d.train(msgs.HamStream(hname, h2, train=1),
msgs.SpamStream(sname, s2, train=1))
else:
# Forget this set.
d.untrain(hamstream, spamstream)
# Predict this set.
d.test(hamstream, spamstream)
d.finishtest()
if i < nsets - 1 and not options.build_each_classifier_from_scratch:
# Add this set back in.
d.train(hamstream, spamstream)
d.alldone()
def main():
import getopt
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
['HamTrain=', 'SpamTrain=',
'HamTest=', 'SpamTest=',
'ham-keep=', 'spam-keep='])
except getopt.error, msg:
usage(1, msg)
nsets = seed = hamtrain = spamtrain = None
hamtest = spamtest = hamkeep = spamkeep = None
for opt, arg in opts:
if opt == '-h':
usage(0)
elif opt == '-n':
nsets = int(arg)
elif opt == '-s':
seed = int(arg)
elif opt == '--HamTest':
hamtest = int(arg)
elif opt == '--SpamTest':
spamtest = int(arg)
elif opt == '--HamTrain':
hamtrain = int(arg)
elif opt == '--SpamTrain':
spamtrain = int(arg)
elif opt == '--ham-keep':
hamkeep = int(arg)
elif opt == '--spam-keep':
spamkeep = int(arg)
if args:
usage(1, "Positional arguments not supported")
if nsets is None:
usage(1, "-n is required")
if hamkeep is not None:
msgs.setparms(hamkeep, spamkeep, seed=seed)
else:
msgs.setparms(hamtrain, spamtrain, hamtest, spamtest, seed)
drive(nsets)
if __name__ == "__main__":
main()
--- NEW FILE: timtest.py ---
#! /usr/bin/env python
# A test driver using "the standard" test directory structure. See also
# rates.py and cmp.py for summarizing results. This runs an NxN test grid,
# skipping the diagonal.
"""Usage: %(program)s [options] -n nsets
Where:
-h
Show usage and exit.
-n int
Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
This is required.
If you only want to use some of the messages in each set,
--ham-keep int
The maximum number of msgs to use from each Ham set. The msgs are
chosen randomly. See also the -s option.
--spam-keep int
The maximum number of msgs to use from each Spam set. The msgs are
chosen randomly. See also the -s option.
-s int
A seed for the random number generator. Has no effect unless
at least one of {--ham-keep, --spam-keep} is specified. If -s
isn't specified, the seed is taken from current time.
In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""
from __future__ import generators
import sys
from spambayes.Options import options
from spambayes import TestDriver
from spambayes import msgs
program = sys.argv[0]
def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    out = sys.stderr
    if msg:
        # The message, then an empty line.
        out.write("%s\n" % str(msg))
        out.write("\n")
    out.write("%s\n" % (__doc__ % globals()))
    sys.exit(code)
def drive(nsets):
print options.display()
spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
hamdirs = [options.ham_directories % i for i in range(1, nsets+1)]
spamhamdirs = zip(spamdirs, hamdirs)
d = TestDriver.Driver()
for spamdir, hamdir in spamhamdirs:
d.new_classifier()
d.train(msgs.HamStream(hamdir, [hamdir]),
msgs.SpamStream(spamdir, [spamdir]))
for sd2, hd2 in spamhamdirs:
if (sd2, hd2) == (spamdir, hamdir):
continue
d.test(msgs.HamStream(hd2, [hd2]),
msgs.SpamStream(sd2, [sd2]))
d.finishtest()
d.alldone()
def main():
import getopt
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
['ham-keep=', 'spam-keep='])
except getopt.error, msg:
usage(1, msg)
nsets = seed = hamkeep = spamkeep = None
for opt, arg in opts:
if opt == '-h':
usage(0)
elif opt == '-n':
nsets = int(arg)
elif opt == '-s':
seed = int(arg)
elif opt == '--ham-keep':
hamkeep = int(arg)
elif opt == '--spam-keep':
spamkeep = int(arg)
if args:
usage(1, "Positional arguments not supported")
if nsets is None:
usage(1, "-n is required")
msgs.setparms(hamkeep, spamkeep, seed=seed)
drive(nsets)
if __name__ == "__main__":
main()
--- NEW FILE: weaktest.py ---
#! /usr/bin/env python
# A test driver using "the standard" test directory structure.
# This simulates a user that gets E-mail, and only trains on fp,
# fn and unsure messages. It starts by training unconditionally on the
# first few messages (see -m; the default is 10), and from that point on
# well classified messages will
# not be used for training. This can be used to see what the performance
# of the scoring algorithm is under such conditions. Questions are:
# * How does the size of the database behave over time?
# * Does the classification get better over time?
# * Are there other combinations of parameters for the classifier
# that make this better behaved than the default values?
"""Usage: %(program)s [options] -n nsets
Where:
-h
Show usage and exit.
-n int
Number of Set directories (Data/Spam/Set1, ... and Data/Ham/Set1, ...).
This is required.
-d decider
Name of the decider. One of %(decisionkeys)s
-m min
Minimal number of messages to train on before involving the decider.
In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""
from __future__ import generators
import sys,os
from spambayes.Options import options
from spambayes import hammie, msgs, CostCounter
program = sys.argv[0]
debug = 0
def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    lines = []
    if msg:
        # The message, then an empty line.
        lines.append(str(msg))
        lines.append("")
    lines.append(__doc__ % globals())
    for text in lines:
        print >> sys.stderr, text
    sys.exit(code)
# Results of a training decision.  drive() compares a decider's return
# value against TRAIN_AS_SPAM and TRAIN_AS_HAM and trains on nothing
# for any other value.
DONT_TRAIN = None
TRAIN_AS_HAM = 1
TRAIN_AS_SPAM = 2
class TrainDecision:
    """Base class for deciders.

    Calling an instance with (scr, is_spam) forwards scr to
    self.spamtrain when is_spam is true and to self.hamtrain otherwise;
    subclasses supply those two methods.
    """

    def __call__(self,scr,is_spam):
        if not is_spam:
            return self.hamtrain(scr)
        return self.spamtrain(scr)
class UnsureAndFalses(TrainDecision):
    """Train on a message unless its score is on the right side of its cutoff.

    Spam is trained on when it scores below options.spam_cutoff, ham
    when it scores above options.ham_cutoff.
    """

    def spamtrain(self,scr):
        if scr < options.spam_cutoff:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self,scr):
        if scr > options.ham_cutoff:
            return TRAIN_AS_HAM
        return DONT_TRAIN
class UnsureOnly(TrainDecision):
    """Train only on messages scoring strictly between the two cutoffs."""

    def _unsure(self,scr):
        # True when scr lies strictly between ham_cutoff and spam_cutoff.
        return options.ham_cutoff < scr < options.spam_cutoff

    def spamtrain(self,scr):
        if self._unsure(scr):
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self,scr):
        if self._unsure(scr):
            return TRAIN_AS_HAM
        return DONT_TRAIN
class All(TrainDecision):
    """Train on every message, whatever its score."""

    def spamtrain(self,scr):
        decision = TRAIN_AS_SPAM
        return decision

    def hamtrain(self,scr):
        decision = TRAIN_AS_HAM
        return decision
class AllBut0and100(TrainDecision):
    """Train on everything except the most confidently scored messages.

    Spam is trained on when it scores below 0.995, ham when it scores
    above 0.005.
    """

    def spamtrain(self,scr):
        if scr < 0.995:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    def hamtrain(self,scr):
        if scr > 0.005:
            return TRAIN_AS_HAM
        return DONT_TRAIN
class OwnDecision(TrainDecision):
    """Train according to the score alone, ignoring the true class.

    A score below options.ham_cutoff trains as ham, one above
    options.spam_cutoff trains as spam, and anything else is not
    trained on.  The same rule is used for ham and spam messages.
    """

    def hamtrain(self,scr):
        if scr < options.ham_cutoff:
            return TRAIN_AS_HAM
        if scr > options.spam_cutoff:
            return TRAIN_AS_SPAM
        return DONT_TRAIN

    spamtrain = hamtrain
class OwnDecisionFNCorrection(OwnDecision):
    """Like OwnDecision, but every spam message is trained as spam."""

    def spamtrain(self,scr):
        decision = TRAIN_AS_SPAM
        return decision
# Maps the name accepted by the -d option to the decider class.
decisions={'all': All,
           'allbut0and100': AllBut0and100,
           'unsureonly': UnsureOnly,
           'unsureandfalses': UnsureAndFalses,
           'owndecision': OwnDecision,
           'owndecision+fn': OwnDecisionFNCorrection,
          }
# Sorted list of names, substituted into the usage text as
# %(decisionkeys)s.
decisionkeys=decisions.keys()
decisionkeys.sort()
class FirstN:
    """Decider wrapper that trains on the first n messages unconditionally.

    The first n calls return TRAIN_AS_SPAM or TRAIN_AS_HAM according to
    is_spam; every later call is forwarded to client(scr, is_spam).
    x counts the calls made so far.
    """

    def __init__(self,n,client):
        self.client = client
        self.x = 0
        self.n = n

    def __call__(self,scr,is_spam):
        # Decide before counting this message.  Incrementing first would
        # hand message number n to the client, although tooearly() still
        # reports that message as too early to a caller that asks just
        # before making this call.
        early = self.tooearly()
        self.x += 1
        if early:
            if is_spam:
                return TRAIN_AS_SPAM
            return TRAIN_AS_HAM
        return self.client(scr,is_spam)

    def tooearly(self):
        """Return true while fewer than n messages have been seen."""
        return self.x < self.n
class Updater:
    """Holds a single value in the attribute d, replaceable through setd()."""

    def __init__(self,d=None):
        self.setd(d)

    def setd(self,d):
        """Store d as self.d."""
        setattr(self, 'd', d)
def drive(nsets,decision):
print options.display()
spamdirs = [options.spam_directories % i for i in range(1, nsets+1)]
hamdirs = [options.ham_directories % i for i in range(1, nsets+1)]
spamfns = [(x,y,1) for x in spamdirs for y in os.listdir(x)]
hamfns = [(x,y,0) for x in hamdirs for y in os.listdir(x)]
nham = len(hamfns)
nspam = len(spamfns)
cc = CostCounter.nodelay()
allfns = {}
for fn in spamfns+hamfns:
allfns[fn] = None
d = hammie.open('weaktest.db', False)
hamtrain = 0
spamtrain = 0
n = 0
for dir,name, is_spam in allfns.iterkeys():
n += 1
m=msgs.Msg(dir, name).guts
if debug > 1:
print "trained:%dH+%dS"%(hamtrain,spamtrain)
scr=d.score(m)
if debug > 1:
print "score:%.3f"%scr
if not decision.tooearly():
if is_spam:
if debug > 0:
print "Spam with score %.2f"%scr
cc.spam(scr)
else:
if debug > 0:
print "Ham with score %.2f"%scr
cc.ham(scr)
de = decision(scr,is_spam)
if de == TRAIN_AS_SPAM:
d.train_spam(m)
spamtrain += 1
elif de == TRAIN_AS_HAM:
d.train_ham(m)
hamtrain += 1
if n % 100 == 0:
print "%5d trained:%dH+%dS wrds:%d"%(
n, hamtrain, spamtrain, len(d.bayes.wordinfo))
print cc
print "="*70
print "%5d trained:%dH+%dS wrds:%d"%(
n, hamtrain, spamtrain, len(d.bayes.wordinfo))
print cc
def main():
global debug
import getopt
try:
opts, args = getopt.getopt(sys.argv[1:], 'vd:hn:m:')
except getopt.error, msg:
usage(1, msg)
nsets = None
decision = decisions['unsureonly']
m = 10
for opt, arg in opts:
if opt == '-h':
usage(0)
elif opt == '-n':
nsets = int(arg)
elif opt == '-v':
debug += 1
elif opt == '-m':
m = int(arg)
elif opt == '-d':
if not decisions.has_key(arg):
usage(1,'Unknown decisionmaker')
decision = decisions[arg]
if args:
usage(1, "Positional arguments not supported")
if nsets is None:
usage(1, "-n is required")
drive(nsets,decision=FirstN(m,decision()))
if __name__ == "__main__":
main()
- Previous message: [Spambayes-checkins] spambayes/spambayes Corpus.py,NONE,1.1.2.1
CostCounter.py,NONE,1.1.2.1 FileCorpus.py,NONE,1.1.2.1
Histogram.py,NONE,1.1.2.1 Options.py,NONE,1.1.2.1
TestDriver.py,NONE,1.1.2.1 Tester.py,NONE,1.1.2.1
__init__.py,NONE,1.1.2.1 cdb.py,NONE,1.1.2.1 chi2.py,NONE,1.1.2.1
classifier.py,NONE,1.1.2.1 dbmstorage.py,NONE,1.1.2.1
hammie.py,NONE,1.1.2.1 hammiebulk.py,NONE,1.1.2.1
mboxutils.py,NONE,1.1.2.1 msgs.py,NONE,1.1.2.1
optimize.py,NONE,1.1.2.1 storage.py,NONE,1.1.2.1 tokenizer.py,NONE,1.1.2.1
- Next message: [Spambayes-checkins] spambayes/utilities HistToGNU.py,NONE,1.1.2.1
loosecksum.py,NONE,1.1.2.1 mboxcount.py,NONE,1.1.2.1
rebal.py,NONE,1.1.2.1 split.py,NONE,1.1.2.1 splitn.py,NONE,1.1.2.1
splitndirs.py,NONE,1.1.2.1
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Spambayes-checkins
mailing list