[Spambayes] proposed changes to hammie & co.

Neale Pickett neale@woozle.org
Tue Nov 19 00:35:55 2002


okay, here's the big diff I was talking about.  This would take all
the front-end functionality out of hammie.py.  So there would need to
be yet another hammie*.py file, a front-end to the new Hammie class
that acts like the all-singing, all-dancing program hammie currently
is.

This moves everything but the Hammie class out of hammie.py.  DBDict
goes into its own module, which you could take out and use elsewhere
if you wanted.  The PersistentBayes in hammie.py goes away, replaced
by the DBDictBayes class in Bayes.py.  I haven't had time to implement
the rest of the stuff yet, but that is what would go into the new
front-end.
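
To give the flavor, the new front-end would boil down to something
like this (just a sketch against the classes in the diff below;
"spam.msg" is a made-up filename):

    import sys

    import hammie
    from Options import options

    # Open (or create) the persistent store, wrapped in a Hammie.
    h = hammie.open(options.persistent_storage_file,
                    options.persistent_use_database, 'c')

    # Train on a known-spam message, then recompute probabilities;
    # update_probabilities() also writes the store unless store=False.
    h.train_spam(open("spam.msg").read())
    h.update_probabilities()

    # Score a message from stdin and emit it with the disposition header.
    print h.filter(sys.stdin.read())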

So the happy hammie family would then stand at:

  hammie.py
  |-- hammiefilter.py
  |-- pop3proxy.py
  |-- hammiesrv.py
  \-- hammie-new-front-end.py

This change appears to work fine with hammiefilter and pop3proxy.  But
it's a pretty big change, so I'd like to hear what at least Richie and
Tim Stone think before I commit anything.

Neale

? Outlook2000
? diff
? email
? hammiebatch.py
Index: Bayes.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Bayes.py,v
retrieving revision 1.5
diff -u -r1.5 Bayes.py
--- Bayes.py	18 Nov 2002 13:04:20 -0000	1.5
+++ Bayes.py	19 Nov 2002 00:24:57 -0000
@@ -56,11 +56,10 @@
 all the spambayes contributors."
 
 import Corpus
-from classifier import Bayes
+import classifier
 from Options import options
-from hammie import DBDict     # hammie only for DBDict, which should
-                              # probably really be somewhere else
 import cPickle as pickle
+import dbdict
 import errno
 import copy
 import anydbm
@@ -69,7 +68,7 @@
 NO_UPDATEPROBS = False   # Probabilities will not be autoupdated with training
 UPDATEPROBS = True       # Probabilities will be autoupdated with training
 
-class PersistentBayes(Bayes):
+class PersistentBayes(classifier.Bayes):
     '''Persistent Bayes database object'''
 
     def __init__(self, db_name):
@@ -169,12 +168,49 @@
         self.wordinfo, self.nspam, self.nham = t[1:]
 
 
+class WIDict(dbdict.DBDict):
+    """DBDict optimized for holding lots of WordInfo objects.
+
+    Normally, the pickler can figure out that you're pickling the same
+    type thing over and over, and will just tag the type with a new
+    byte, thus reducing Administrative Pickle Bloat(R).  Since the
+    DBDict continually creates new picklers, however, nothing ever gets
+    the chance to do this optimization.
+
+    The WIDict class forces this optimization by stealing the
+    (currently) unused 'W' pickle type for WordInfo objects.  This
+    results in about a 50% reduction in database size.
+
+    """
+
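+    # A stored record is thus either 'W' + pickle.dumps(val.__getstate__(), 1)
+    # for a WordInfo, or a bare pickle.dumps(val, 1) for anything else;
+    # __getitem__ below dispatches on that first byte.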
+    def __getitem__(self, key):
+        v = self.hash[key]
+        if v[0] == 'W':
+            val = pickle.loads(v[1:])
+            # We could be sneaky, like pickle.Unpickler.load_inst,
+            # but I think that's overly confusing.
+            obj = classifier.WordInfo(0)
+            obj.__setstate__(val)
+            return obj
+        else:
+            return pickle.loads(v)
+
+    def __setitem__(self, key, val):
+        if isinstance(val, classifier.WordInfo):
+            val = val.__getstate__()
+            v = 'W' + pickle.dumps(val, 1)
+        else:
+            v = pickle.dumps(val, 1)
+        self.hash[key] = v
+
+
 class DBDictBayes(PersistentBayes):
     '''Bayes object persisted in a hammie.DB_Dict'''
 
-    def __init__(self, db_name):
+    def __init__(self, db_name, mode='c'):
         '''Constructor(database name)'''
 
+        self.mode = mode
         self.db_name = db_name
         self.statekey = "saved state"
 
@@ -186,7 +222,8 @@
         if Corpus.Verbose:
             print 'Loading state from',self.db_name,'DB_Dict'
 
-        self.wordinfo = DBDict(self.db_name, 'c')
+        self.wordinfo = WIDict(self.db_name, self.mode,
+                               iterskip=[self.statekey])
 
         if self.wordinfo.has_key(self.statekey):
 
@@ -216,7 +253,7 @@
 
     def __init__(self, bayes, trainertype, updateprobs=NO_UPDATEPROBS):
         '''Constructor(Bayes, \
-                       Corpus.SPAM|Corpus.HAM), updprobs(True|False)'''
+            Corpus.SPAM|Corpus.HAM), updprobs(True|False)'''
 
         self.bayes = bayes
         self.trainertype = trainertype
@@ -286,4 +323,4 @@
 
 
 if __name__ == '__main__':
-    print >>sys.stderr, __doc__
\ No newline at end of file
+    print >>sys.stderr, __doc__
Index: dbdict.py
===================================================================
RCS file: dbdict.py
diff -N dbdict.py
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ dbdict.py	19 Nov 2002 00:24:57 -0000
@@ -0,0 +1,92 @@
+#! /usr/bin/env python
+
+from __future__ import generators
+import dbhash
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+class DBDict:
+    """Database Dictionary.
+
+    This wraps a dbhash database to make it look even more like a
+    dictionary, much like the built-in shelf class.  The difference is
+    that a DBDict supports all dict methods.
+
+    Call it with the database.  Optionally, you can specify a list of
+    keys to skip when iterating.  This only affects iterators; things
+    like .keys() still list everything.  For instance:
+
+    >>> d = DBDict('goober.db', 'c', ('skipme', 'skipmetoo'))
+    >>> d['skipme'] = 'booga'
+    >>> d['countme'] = 'wakka'
+    >>> print d.keys()
+    ['skipme', 'countme']
+    >>> for k in d.iterkeys():
+    ...     print k
+    countme
+
+    """
+
+    def __init__(self, dbname, mode, iterskip=()):
+        self.hash = dbhash.open(dbname, mode)
+        self.iterskip = iterskip
+
+    def __getitem__(self, key):
+        return pickle.loads(self.hash[key])
+
+    def __setitem__(self, key, val):
+        self.hash[key] = pickle.dumps(val, 1)
+
+    def __delitem__(self, key):
+        del(self.hash[key])
+
+    def __iter__(self, fn=None):
+        k = self.hash.first()
+        while k != None:
+            key = k[0]
+            val = self.__getitem__(key)
+            if key not in self.iterskip:
+                if fn:
+                    yield fn((key, val))
+                else:
+                    yield (key, val)
+            try:
+                k = self.hash.next()
+            except KeyError:
+                break
+
+    def __contains__(self, name):
+        return self.has_key(name)
+
+    def __getattr__(self, name):
+        # Pass the buck
+        return getattr(self.hash, name)
+
+    def get(self, key, dfl=None):
+        if self.has_key(key):
+            return self[key]
+        else:
+            return dfl
+
+    def iteritems(self):
+        return self.__iter__()
+
+    def iterkeys(self):
+        return self.__iter__(lambda k: k[0])
+
+    def itervalues(self):
+        return self.__iter__(lambda k: k[1])
+
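+# Alias so the module reads like anydbm/shelve: dbdict.open(name, mode).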
+open = DBDict
+
+def _test():
+    import doctest
+    import dbdict
+
+    doctest.testmod(dbdict)
+
+if __name__ == '__main__':
+    _test()
+
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40
diff -u -r1.40 hammie.py
--- hammie.py	18 Nov 2002 18:13:54 -0000	1.40
+++ hammie.py	19 Nov 2002 00:24:57 -0000
@@ -1,57 +1,11 @@
 #! /usr/bin/env python
 
-# A driver for the classifier module and Tim's tokenizer that you can
-# call from procmail.
-
-"""Usage: %(program)s [options]
-
-Where:
-    -h
-        show usage and exit
-    -g PATH
-        mbox or directory of known good messages (non-spam) to train on.
-        Can be specified more than once, or use - for stdin.
-    -s PATH
-        mbox or directory of known spam messages to train on.
-        Can be specified more than once, or use - for stdin.
-    -u PATH
-        mbox of unknown messages.  A ham/spam decision is reported for each.
-        Can be specified more than once.
-    -r
-        reverse the meaning of the check (report ham instead of spam).
-        Only meaningful with the -u option.
-    -p FILE
-        use file as the persistent store.  loads data from this file if it
-        exists, and saves data to this file at the end.
-        Default: %(DEFAULTDB)s
-    -d
-        use the DBM store instead of cPickle.  The file is larger and
-        creating it is slower, but checking against it is much faster,
-        especially for large word databases. Default: %(USEDB)s
-    -D
-        the reverse of -d: use the cPickle instead of DBM
-    -f
-        run as a filter: read a single message from stdin, add an
-        %(DISPHEADER)s header, and write it to stdout.  If you want to
-        run from procmail, this is your option.
-"""
-
-from __future__ import generators
-
-import sys
-import os
-import types
-import getopt
-import mailbox
-import glob
-import email
-import errno
-import anydbm
-import cPickle as pickle
 
+import dbdict
 import mboxutils
-import classifier
+import Bayes
 from Options import options
+from tokenizer import tokenize
 
 try:
     True, False
@@ -60,166 +14,14 @@
     True, False = 1, 0
 
 
-program = sys.argv[0] # For usage(); referenced by docstring above
-
-# Name of the header to add in filter mode
-DISPHEADER = options.hammie_header_name
-DEBUGHEADER = options.hammie_debug_header_name
-DODEBUG = options.hammie_debug_header
-
-# Default database name
-DEFAULTDB = options.persistent_storage_file
-
-# Probability at which a message is considered spam
-SPAM_THRESHOLD = options.spam_cutoff
-HAM_THRESHOLD = options.ham_cutoff
-
-# Probability limit for a clue to be added to the DISPHEADER
-SHOWCLUE = options.clue_mailheader_cutoff
-
-# Use a database? If False, use a pickle
-USEDB = options.persistent_use_database
-
-# Tim's tokenizer kicks far more booty than anything I would have
-# written.  Score one for analysis ;)
-from tokenizer import tokenize
-
-class DBDict:
-
-    """Database Dictionary.
-
-    This wraps an anydbm to make it look even more like a dictionary.
-
-    Call it with the name of your database file.  Optionally, you can
-    specify a list of keys to skip when iterating.  This only affects
-    iterators; things like .keys() still list everything.  For instance:
-
-    >>> d = DBDict('/tmp/goober.db', ('skipme', 'skipmetoo'))
-    >>> d['skipme'] = 'booga'
-    >>> d['countme'] = 'wakka'
-    >>> print d.keys()
-    ['skipme', 'countme']
-    >>> for k in d.iterkeys():
-    ...     print k
-    countme
-
-    """
-
-    def __init__(self, dbname, mode, iterskip=()):
-        self.hash = anydbm.open(dbname, mode)
-        self.iterskip = iterskip
-
-    def __getitem__(self, key):
-        v = self.hash[key]
-        if v[0] == 'W':
-            val = pickle.loads(v[1:])
-            # We could be sneaky, like pickle.Unpickler.load_inst,
-            # but I think that's overly confusing.
-            obj = classifier.WordInfo(0)
-            obj.__setstate__(val)
-            return obj
-        else:
-            return pickle.loads(v)
-
-    def __setitem__(self, key, val):
-        if isinstance(val, classifier.WordInfo):
-            val = val.__getstate__()
-            v = 'W' + pickle.dumps(val, 1)
-        else:
-            v = pickle.dumps(val, 1)
-        self.hash[key] = v
-
-    def __delitem__(self, key, val):
-        del(self.hash[key])
-
-    def __iter__(self, fn=None):
-        k = self.hash.first()
-        while k != None:
-            key = k[0]
-            val = self.__getitem__(key)
-            if key not in self.iterskip:
-                if fn:
-                    yield fn((key, val))
-                else:
-                    yield (key, val)
-            try:
-                k = self.hash.next()
-            except KeyError:
-                break
-
-    def __contains__(self, name):
-        return self.has_key(name)
-
-    def __getattr__(self, name):
-        # Pass the buck
-        return getattr(self.hash, name)
-
-    def get(self, key, dfl=None):
-        if self.has_key(key):
-            return self[key]
-        else:
-            return dfl
-
-    def iteritems(self):
-        return self.__iter__()
-
-    def iterkeys(self):
-        return self.__iter__(lambda k: k[0])
-
-    def itervalues(self):
-        return self.__iter__(lambda k: k[1])
-
-
-class PersistentBayes(classifier.Bayes):
-
-    """A persistent Bayes classifier.
-
-    This is just like classifier.Bayes, except that the dictionary is a
-    database.  You take less disk this way and you can pretend it's
-    persistent.  The tradeoffs vs. a pickle are: 1. it's slower
-    training, but faster checking, and 2. it needs less memory to run,
-    but takes more space on the hard drive.
+class Hammie:
+    """A spambayes mail filter.
 
-    On destruction, an instantiation of this class will write its state
-    to a special key.  When you instantiate a new one, it will attempt
-    to read these values out of that key again, so you can pick up where
-    you left off.
+    This implements the basic functionality needed to score, filter, or
+    train.  
 
     """
 
-    # XXX: Would it be even faster to remember (in a list) which keys
-    # had been modified, and only recalculate those keys?  No sense in
-    # going over the entire word database if only 100 words are
-    # affected.
-
-    # XXX: Another idea: cache stuff in memory.  But by then maybe we
-    # should just use ZODB.
-
-    def __init__(self, dbname, mode):
-        classifier.Bayes.__init__(self)
-        self.statekey = "saved state"
-        self.wordinfo = DBDict(dbname, mode, (self.statekey,))
-        self.dbmode = mode
-
-        self.restore_state()
-
-    def __del__(self):
-        #super.__del__(self)
-        self.save_state()
-
-    def save_state(self):
-        if self.dbmode != 'r':
-            self.wordinfo[self.statekey] = (self.nham, self.nspam)
-
-    def restore_state(self):
-        if self.wordinfo.has_key(self.statekey):
-            self.nham, self.nspam = self.wordinfo[self.statekey]
-
-
-class Hammie:
-
-    """A spambayes mail filter"""
-
     def __init__(self, bayes):
         self.bayes = bayes
 
@@ -262,9 +64,9 @@
             import traceback
             traceback.print_exc()
 
-    def filter(self, msg, header=DISPHEADER, spam_cutoff=SPAM_THRESHOLD,
-               ham_cutoff=HAM_THRESHOLD, debugheader=DEBUGHEADER,
-               debug=DODEBUG):
+    def filter(self, msg, header=None, spam_cutoff=None,
+               ham_cutoff=None, debugheader=None,
+               debug=None):
         """Score (judge) a message and add a disposition header.
 
         msg can be a string, a file object, or a Message object.
@@ -282,6 +84,17 @@
 
         """
 
+        if header == None:
+            header = options.hammie_header_name
+        if spam_cutoff == None:
+            spam_cutoff = options.spam_cutoff
+        if ham_cutoff == None:
+            ham_cutoff = options.ham_cutoff
+        if debugheader == None:
+            debugheader = options.hammie_debug_header_name
+        if debug == None:
+            debug = options.hammie_debug_header
+
         msg = mboxutils.get_message(msg)
         try:
             del msg[header]
@@ -348,163 +161,47 @@
 
         self.train(msg, True)
 
-    def update_probabilities(self):
+    def update_probabilities(self, store=True):
         """Update probability values.
 
         You would want to call this after a training session.  It's
         pretty slow, so if you have a lot of messages to train, wait
         until you're all done before calling this.
 
+        Unless store is false, the persistent store will be written after
+        updating probabilities.
+
         """
 
         self.bayes.update_probabilities()
+        if store:
+            self.store()
 
+    def store(self):
+        """Write out the persistent store.
+
+        This makes sure the persistent store reflects what is currently
+        in memory.  You would want to do this after a write and before
+        exiting.
+
+        """
+
+        self.bayes.store()
+
+
+def open(filename, usedb=True, mode='r'):
+    """Open a file, returning a Hammie instance.
+
+    If usedb is False, open as a pickle instead of a DBDict.  mode is
+    used as the flag to open DBDict objects: 'c' for read-write (create
+    if needed), 'r' for read-only, 'w' for read-write.
+
+    """
 
-def train(hammie, msgs, is_spam):
-    """Train bayes with all messages from a mailbox."""
-    mbox = mboxutils.getmbox(msgs)
-    i = 0
-    for msg in mbox:
-        i += 1
-        # XXX: Is the \r a Unixism?  I seem to recall it working in DOS
-        # back in the day.  Maybe it's a line-printer-ism ;)
-        sys.stdout.write("\r%6d" % i)
-        sys.stdout.flush()
-        hammie.train(msg, is_spam)
-    print
-
-def score(hammie, msgs, reverse=0):
-    """Score (judge) all messages from a mailbox."""
-    # XXX The reporting needs work!
-    mbox = mboxutils.getmbox(msgs)
-    i = 0
-    spams = hams = 0
-    for msg in mbox:
-        i += 1
-        prob, clues = hammie.score(msg, True)
-        if hasattr(msg, '_mh_msgno'):
-            msgno = msg._mh_msgno
-        else:
-            msgno = i
-        isspam = (prob >= SPAM_THRESHOLD)
-        if isspam:
-            spams += 1
-            if not reverse:
-                print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
-                print hammie.formatclues(clues)
-        else:
-            hams += 1
-            if reverse:
-                print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
-                print hammie.formatclues(clues)
-    return (spams, hams)
-
-def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
-    """Create a Bayes instance for the given pickle (which
-    doesn't have to exist).  Create a PersistentBayes if
-    usedb is True."""
     if usedb:
-        bayes = PersistentBayes(pck, mode)
+        b = Bayes.DBDictBayes(filename, mode)
     else:
-        bayes = None
-        try:
-            fp = open(pck, 'rb')
-        except IOError, e:
-            if e.errno <> errno.ENOENT: raise
-        else:
-            bayes = pickle.load(fp)
-            fp.close()
-        if bayes is None:
-            bayes = classifier.Bayes()
-    return bayes
-
-def usage(code, msg=''):
-    """Print usage message and sys.exit(code)."""
-    if msg:
-        print >> sys.stderr, msg
-        print >> sys.stderr
-    print >> sys.stderr, __doc__ % globals()
-    sys.exit(code)
-
-def main():
-    """Main program; parse options and go."""
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
-    except getopt.error, msg:
-        usage(2, msg)
-
-    if not opts:
-        usage(2, "No options given")
-
-    pck = DEFAULTDB
-    good = []
-    spam = []
-    unknown = []
-    reverse = 0
-    do_filter = False
-    usedb = USEDB
-    mode = 'r'
-    for opt, arg in opts:
-        if opt == '-h':
-            usage(0)
-        elif opt == '-g':
-            good.append(arg)
-            mode = 'c'
-        elif opt == '-s':
-            spam.append(arg)
-            mode = 'c'
-        elif opt == '-p':
-            pck = arg
-        elif opt == "-d":
-            usedb = True
-        elif opt == "-D":
-            usedb = False
-        elif opt == "-f":
-            do_filter = True
-        elif opt == '-u':
-            unknown.append(arg)
-        elif opt == '-r':
-            reverse = 1
-    if args:
-        usage(2, "Positional arguments not allowed")
-
-    save = False
-
-    bayes = createbayes(pck, usedb, mode)
-    h = Hammie(bayes)
-
-    for g in good:
-        print "Training ham (%s):" % g
-        train(h, g, False)
-        save = True
-
-    for s in spam:
-        print "Training spam (%s):" % s
-        train(h, s, True)
-        save = True
-
-    if save:
-        h.update_probabilities()
-        if not usedb and pck:
-            fp = open(pck, 'wb')
-            pickle.dump(bayes, fp, 1)
-            fp.close()
-
-    if do_filter:
-        msg = sys.stdin.read()
-        filtered = h.filter(msg)
-        sys.stdout.write(filtered)
-
-    if unknown:
-        (spams, hams) = (0, 0)
-        for u in unknown:
-            if len(unknown) > 1:
-                print "Scoring", u
-            s, g = score(h, u, reverse)
-            spams += s
-            hams += g
-        print "Total %d spam, %d ham" % (spams, hams)
-
+        b = Bayes.PickledBayes(filename)
+    return Hammie(b)
 
-if __name__ == "__main__":
-    main()
Index: hammiefilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v
retrieving revision 1.2
diff -u -r1.2 hammiefilter.py
--- hammiefilter.py	18 Nov 2002 18:14:04 -0000	1.2
+++ hammiefilter.py	19 Nov 2002 00:24:57 -0000
@@ -51,43 +51,37 @@
     print >> sys.stderr, __doc__ % globals()
     sys.exit(code)
 
-def jar_pickle(h):
-    if not options.persistent_use_database:
-        import pickle
-        fp = open(options.persistent_storage_file, 'wb')
-        pickle.dump(h.bayes, fp, 1)
-        fp.close()
-    
-
-def hammie_open(mode):
-    b = hammie.createbayes(options.persistent_storage_file,
-                           options.persistent_use_database,
-                           mode)
-    return hammie.Hammie(b)
-
 def newdb():
-    h = hammie_open('n')
-    jar_pickle(h)
+    h = hammie.open(options.persistent_storage_file,
+                    options.persistent_use_database,
+                    'n')
+    h.store()
     print "Created new database in", options.persistent_storage_file
 
 def filter():
-    h = hammie_open('r')
+    h = hammie.open(options.persistent_storage_file,
+                    options.persistent_use_database,
+                    'r')
     msg = sys.stdin.read()
     print h.filter(msg)
 
 def train_ham():
-    h = hammie_open('w')
+    h = hammie.open(options.persistent_storage_file,
+                    options.persistent_use_database,
+                    'w')
     msg = sys.stdin.read()
     h.train_ham(msg)
     h.update_probabilities()
-    jar_pickle(h)    
+    h.store()
 
 def train_spam():
-    h = hammie_open('w')
+    h = hammie.open(options.persistent_storage_file,
+                    options.persistent_use_database,
+                    'w')
     msg = sys.stdin.read()
     h.train_spam(msg)
     h.update_probabilities()
-    jar_pickle(h)    
+    h.store()
 
 def main():
     action = filter


