[Spambayes] Modifications to timcv.py
T. Alexander Popiel
popiel@wolfskeep.com
Wed, 09 Oct 2002 11:04:39 -0700
The inability to use timcv.py with the central limit stuff
annoyed me. I offer this patch to correct that problem...
- Alex
Index: timcv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/timcv.py,v
retrieving revision 1.9
diff -u -r1.9 timcv.py
--- timcv.py 24 Sep 2002 05:37:11 -0000 1.9
+++ timcv.py 9 Oct 2002 17:59:56 -0000
@@ -26,6 +26,15 @@
at least on of {--ham-keep, --spam-keep} is specified. If -s
isn't specifed, the seed is taken from current time.
+If you want full retraining for each classifier (because untrain and
+retrain don't work), use:
+
+ --trainstyle arg
+ Use one of the following training styles:
+ partial: train on everything, then untrain individual sets
+ full: train from scratch on only applicable sets
+ partial is the historical (and default) behaviour.
+
In addition, an attempt is made to merge bayescustomize.ini into the options.
If that exists, it can be used to change the settings in Options.options.
"""
@@ -48,7 +57,7 @@
print >> sys.stderr, __doc__ % globals()
sys.exit(code)
-def drive(nsets):
+def drive(nsets, trainstyle):
print options.display()
hamdirs = [options.ham_directories % i for i in range(1, nsets+1)]
@@ -67,16 +76,28 @@
spamstream = msgs.SpamStream(s, [s])
if i > 0:
- # Forget this set.
- d.untrain(hamstream, spamstream)
+ if trainstyle == 'partial':
+ # Forget this set.
+ d.untrain(hamstream, spamstream)
+ elif trainstyle == 'full':
+ # Retrain with the other sets.
+ hname = "%s-%d, except %d" % (hamdirs[0], nsets, i + 1)
+ h2 = hamdirs * 1
+ del h2[i]
+ sname = "%s-%d, except %d" % (spamdirs[0], nsets, i + 1)
+ s2 = spamdirs * 1
+ del s2[i]
+ d.new_classifier()
+ d.train(msgs.HamStream(hname, h2), msgs.SpamStream(sname, s2))
# Predict this set.
d.test(hamstream, spamstream)
d.finishtest()
if i < nsets - 1:
- # Add this set back in.
- d.train(hamstream, spamstream)
+ if trainstyle == 'partial':
+ # Add this set back in.
+ d.train(hamstream, spamstream)
d.alldone()
@@ -85,11 +106,12 @@
try:
opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
- ['ham-keep=', 'spam-keep='])
+ ['ham-keep=', 'spam-keep=', 'trainstyle='])
except getopt.error, msg:
usage(1, msg)
nsets = seed = hamkeep = spamkeep = None
+ trainstyle = 'partial'
for opt, arg in opts:
if opt == '-h':
usage(0)
@@ -101,14 +123,18 @@
hamkeep = int(arg)
elif opt == '--spam-keep':
spamkeep = int(arg)
+ elif opt == '--trainstyle':
+ trainstyle = arg
if args:
usage(1, "Positional arguments not supported")
if nsets is None:
usage(1, "-n is required")
+ if trainstyle not in ('partial', 'full'):
+ usage(1, "Unknown train style '%s'" % trainstyle)
msgs.setparms(hamkeep, spamkeep, seed)
- drive(nsets)
+ drive(nsets, trainstyle)
if __name__ == "__main__":
main()