[Spambayes] Modifications to timcv.py

Wed, 09 Oct 2002 11:04:39 -0700

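timcv.py cannot be used with the central limit schemes, because it
relies on untraining and retraining, which don't work with them.  That
annoyed me, so I offer this patch, which adds a --trainstyle option to
train each classifier from scratch instead...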

- Alex

Index: timcv.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/timcv.py,v
retrieving revision 1.9
diff -u -r1.9 timcv.py

--- timcv.py	24 Sep 2002 05:37:11 -0000	1.9
+++ timcv.py	9 Oct 2002 17:59:56 -0000
@@ -26,6 +26,15 @@
         at least on of {--ham-keep, --spam-keep} is specified.  If -s
         isn't specifed, the seed is taken from current time.
 
+If you want full retraining for each classifier (because untrain and
+retrain don't work), use:
+
+    --trainstyle arg
+        Use one of the following training styles:
+            partial: train on everything, then untrain individual sets
+            full: train from scratch on only applicable sets
+        partial is the historical (and default) behaviour.
+
 In addition, an attempt is made to merge bayescustomize.ini into the options.
 If that exists, it can be used to change the settings in Options.options.
 """
@@ -48,7 +57,7 @@
     print >> sys.stderr, __doc__ % globals()
     sys.exit(code)
 
-def drive(nsets):
+def drive(nsets, trainstyle):
     print options.display()
 
     hamdirs  = [options.ham_directories % i for i in range(1, nsets+1)]
@@ -67,16 +76,28 @@
         spamstream = msgs.SpamStream(s, [s])
 
         if i > 0:
-            # Forget this set.
-            d.untrain(hamstream, spamstream)
+            if trainstyle == 'partial':
+                # Forget this set.
+                d.untrain(hamstream, spamstream)
+            elif trainstyle == 'full':
+                # Retrain with the other sets.
+                hname = "%s-%d, except %d" % (hamdirs[0], nsets, i + 1)
+                h2 = hamdirs * 1
+                del h2[i]
+                sname = "%s-%d, except %d" % (spamdirs[0], nsets, i + 1)
+                s2 = spamdirs * 1
+                del s2[i]
+                d.new_classifier()
+                d.train(msgs.HamStream(hname, h2), msgs.SpamStream(sname, s2))
 
         # Predict this set.
         d.test(hamstream, spamstream)
         d.finishtest()
 
         if i < nsets - 1:
-            # Add this set back in.
-            d.train(hamstream, spamstream)
+            if trainstyle == 'partial':
+                # Add this set back in.
+                d.train(hamstream, spamstream)
 
     d.alldone()
 
@@ -85,11 +106,12 @@
 
     try:
         opts, args = getopt.getopt(sys.argv[1:], 'hn:s:',
-                                   ['ham-keep=', 'spam-keep='])
+                                   ['ham-keep=', 'spam-keep=', 'trainstyle='])
     except getopt.error, msg:
         usage(1, msg)
 
     nsets = seed = hamkeep = spamkeep = None
+    trainstyle = 'partial'
     for opt, arg in opts:
         if opt == '-h':
             usage(0)
@@ -101,14 +123,18 @@
             hamkeep = int(arg)
         elif opt == '--spam-keep':
             spamkeep = int(arg)
+        elif opt == '--trainstyle':
+            trainstyle = arg
 
     if args:
         usage(1, "Positional arguments not supported")
     if nsets is None:
         usage(1, "-n is required")
+    if trainstyle not in ('partial', 'full'):
+        usage(1, "Unknown train style '%s'" % trainstyle)
 
     msgs.setparms(hamkeep, spamkeep, seed)
-    drive(nsets)
+    drive(nsets, trainstyle)
 
 if __name__ == "__main__":
     main()