[Spambayes] Guidance re pickles versus DB for Outlook

Neale Pickett neale@woozle.org
Wed Nov 27 07:29:33 2002


So then, "Tim Peters" <tim_one@email.msn.com> is all like:

> Suggestion: train into a brand new all-dict all-in-memory classifier.
> When that's done, add the dict counts to the DB counts, then throw
> away the dict.  The advantage is code and conceptual simplicity.

Gaw, Tim, do you ever run out of good ideas?

So this /does/ make things simpler.  In fact, I've completely eliminated
the need for dbdict.py.  Results are good, training 1088 messages:

pickle:
  real    0m26.581s
  user    0m25.110s
  sys     0m0.620s

db:
  real    0m52.737s
  user    0m31.730s
  sys     0m10.260s

I can live with 2x slower.  Training and scoring single messages with
the db still blows the pickle's doors off, of course.

I want to know what people think of this diff.  I stopped short of doing
away with WordInfo altogether, though I was tempted >-]

To do this in as simple a way as possible, I added three new methods to
the Classifier class:

    def _wordinfoget(self, word):
        return self.wordinfo.get(word)

    def _wordinfoset(self, word, record):
        self.wordinfo[word] = record

    def _wordinfodel(self, word):
        del self.wordinfo[word]

These are then overridden by the DBDictClassifier class.  I also got rid
of the lame MetaInfo class.

Well, a picture's worth a thousand words:

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.60
diff -u -r1.60 classifier.py
--- classifier.py	26 Nov 2002 20:22:05 -0000	1.60
+++ classifier.py	27 Nov 2002 07:27:38 -0000
@@ -46,31 +46,7 @@
 
 LN2 = math.log(2)       # used frequently by chi-combining
 
-PICKLE_VERSION = 4
-
-class MetaInfo(object):
-    """Information about the corpora.
-
-    Contains nham and nspam, used for calculating probabilities.
-
-    """
-    def __init__(self):
-        self.__setstate__((PICKLE_VERSION, 0, 0))
-
-    def __repr__(self):
-        return "MetaInfo%r" % repr((self._nspam,
-                                    self._nham,
-                                    self.revision))
-
-    def __getstate__(self):
-        return (PICKLE_VERSION, self.nspam, self.nham)
-
-    def __setstate__(self, t):
-        if t[0] != PICKLE_VERSION:
-            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
-        self.nspam, self.nham = t[1:]
-        self.revision = 0
-
+PICKLE_VERSION = 5
 
 class WordInfo(object):
     # Invariant:  For use in a classifier database, at least one of
@@ -108,32 +84,18 @@
 
     def __init__(self):
         self.wordinfo = {}
-        self.meta = MetaInfo()
         self.probcache = {}
+        self.nspam = self.nham = 0
 
     def __getstate__(self):
-        return PICKLE_VERSION, self.wordinfo, self.meta
+        return (PICKLE_VERSION, self.wordinfo, self.nspam, self.nham)
 
     def __setstate__(self, t):
         if t[0] != PICKLE_VERSION:
             raise ValueError("Can't unpickle -- version %s unknown" % t[0])
-        self.wordinfo, self.meta = t[1:]
+        (self.wordinfo, self.nspam, self.nham) = t[1:]
         self.probcache = {}
 
-    # Slacker's way out--pass calls to nham/nspam up to the meta class
-
-    def get_nham(self):
-        return self.meta.nham
-    def set_nham(self, val):
-        self.meta.nham = val
-    nham = property(get_nham, set_nham)
-
-    def get_nspam(self):
-        return self.meta.nspam
-    def set_nspam(self, val):
-        self.meta.nspam = val
-    nspam = property(get_nspam, set_nspam)
-
     # spamprob() implementations.  One of the following is aliased to
     # spamprob, depending on option settings.
 
@@ -330,8 +292,8 @@
         except KeyError:
             pass
 
-        nham = float(self.meta.nham or 1)
-        nspam = float(self.meta.nspam or 1)
+        nham = float(self.nham or 1)
+        nspam = float(self.nspam or 1)
 
         assert hamcount <= nham
         hamratio = hamcount / nham
@@ -419,14 +381,12 @@
     def _add_msg(self, wordstream, is_spam):
         self.probcache = {}    # nuke the prob cache
         if is_spam:
-            self.meta.nspam += 1
+            self.nspam += 1
         else:
-            self.meta.nham += 1
+            self.nham += 1
 
-        wordinfo = self.wordinfo
-        wordinfoget = wordinfo.get
         for word in Set(wordstream):
-            record = wordinfoget(word)
+            record = self._wordinfoget(word)
             if record is None:
                 record = self.WordInfoClass()
 
@@ -435,25 +395,22 @@
             else:
                 record.hamcount += 1
 
-            # Needed to tell a persistent DB that the content changed.
-            wordinfo[word] = record
+            self._wordinfoset(word, record)
 
 
     def _remove_msg(self, wordstream, is_spam):
         self.probcache = {}    # nuke the prob cache
         if is_spam:
-            if self.meta.nspam <= 0:
+            if self.nspam <= 0:
                 raise ValueError("spam count would go negative!")
-            self.meta.nspam -= 1
+            self.nspam -= 1
         else:
-            if self.meta.nham <= 0:
+            if self.nham <= 0:
                 raise ValueError("non-spam count would go negative!")
-            self.meta.nham -= -1
+            self.nham -= -1
 
-        wordinfo = self.wordinfo
-        wordinfoget = wordinfo.get
         for word in Set(wordstream):
-            record = wordinfoget(word)
+            record = self._wordinfoget(word)
             if record is not None:
                 if is_spam:
                     if record.spamcount > 0:
@@ -462,11 +419,9 @@
                     if record.hamcount > 0:
                         record.hamcount -= 1
                 if record.hamcount == 0 == record.spamcount:
-                    del wordinfo[word]
+                    self._wordinfodel(word)
                 else:
-                    # Needed to tell a persistent DB that the content
-                    # changed.
-                    wordinfo[word] = record
+                    self._wordinfoset(word, record)
 
     def _getclues(self, wordstream):
         mindist = options.minimum_prob_strength
@@ -475,9 +430,8 @@
         clues = []  # (distance, prob, word, record) tuples
         pushclue = clues.append
 
-        wordinfoget = self.wordinfo.get
         for word in Set(wordstream):
-            record = wordinfoget(word)
+            record = self._wordinfoget(word)
             if record is None:
                 prob = unknown
             else:
@@ -491,6 +445,16 @@
             del clues[0 : -options.max_discriminators]
         # Return (prob, word, record).
         return [t[1:] for t in clues]
+
+    def _wordinfoget(self, word):
+        return self.wordinfo.get(word)
+
+    def _wordinfoset(self, word, record):
+        self.wordinfo[word] = record
+
+    def _wordinfodel(self, word):
+        del self.wordinfo[word]
+        
 
 
 Bayes = Classifier
Index: dbdict.py
===================================================================
RCS file: dbdict.py
diff -N dbdict.py
--- dbdict.py	25 Nov 2002 20:49:16 -0000	1.4
+++ /dev/null	1 Jan 1970 00:00:00 -0000
@@ -1,152 +0,0 @@
-#! /usr/bin/env python
-
-"""DBDict.py - Dictionary access to anydbm
-
-Classes:
-    DBDict - wraps an anydbm file
-
-Abstract:
-    DBDict class wraps an anydbm file with a reasonably complete set
-    of dictionary access methods.  DBDicts can be iterated like a dictionary.
-    
-    The constructor accepts a class name which is used specifically to
-    to pickle/unpickle an instance of that class.  When an instance of
-    that class is being pickled, the pickler (actually __getstate__) prepends
-    a 'W' to the pickled string, and when the unpickler (really __setstate__)
-    encounters that 'W', it constructs that class (with no constructor
-    arguments) and executes __setstate__ on the constructed instance.
-
-    DBDict accepts an iterskip operand on the constructor.  This is a tuple
-    of hash keys that will be skipped (not seen) during iteration.  This
-    is for iteration only.  Methods such as keys() will return the entire
-    complement of keys in the dbm hash, even if they're in iterskip.  An
-    iterkeys() method is provided for iterating with skipped keys, and
-    itervaluess() is provided for iterating values with skipped keys.
-
-        >>> d = DBDict('/tmp/goober.db', MODE_CREATE, ('skipme', 'skipmetoo'))
-        >>> d['skipme'] = 'booga'
-        >>> d['countme'] = 'wakka'
-        >>> print d.keys()
-        ['skipme', 'countme']
-        >>> for k in d.iterkeys():
-        ...     print k
-        countme
-        >>> for v in d.itervalues():
-        ...     print v
-        wakka
-        >>> for k,v in d.iteritems():
-        ...     print k,v
-        countme wakka
-
-To Do:
-    """
-
-# This module is part of the spambayes project, which is Copyright 2002
-# The Python Software Foundation and is covered by the Python Software
-# Foundation license.
-
-__author__ = "Neale Pickett <neale@woozle.org>, \
-              Tim Stone <tim@fourstonesExpressions.com>"
-__credits__ = "Tim Peters (author of DBDict class), \
-               all the spambayes contributors."
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
-import anydbm
-import errno
-import copy
-import shutil
-import os
-
-MODE_CREATE = 'c'       # create file if necessary, open for readwrite
-MODE_NEW = 'n'          # always create new file, open for readwrite
-MODE_READWRITE = 'w'    # open existing file for readwrite
-MODE_READONLY = 'r'     # open existing file for read only
-
-
-class DBDict:
-    """Database Dictionary.
-
-    This wraps an anydbm database to make it look even more like a
-    dictionary, much like the built-in shelf class.  The difference is
-    that a DBDict supports all dict methods.
-
-    Call it with the database.  Optionally, you can specify a list of
-    keys to skip when iterating.  This only affects iterators; things
-    like .keys() still list everything.  For instance:
-
-    >>> d = DBDict('goober.db', MODE_CREATE, ('skipme', 'skipmetoo'))
-    >>> d['skipme'] = 'booga'
-    >>> d['countme'] = 'wakka'
-    >>> print d.keys()
-    ['skipme', 'countme']
-    >>> for k in d.iterkeys():
-    ...     print k
-    countme
-
-    """
-
-    def __init__(self, dbname, mode, wclass, iterskip=()):
-        self.hash = anydbm.open(dbname, mode)
-        if not iterskip:
-            self.iterskip = iterskip
-        else:
-            self.iterskip = ()
-        self.wclass=wclass
-
-    def __getitem__(self, key):
-        v = self.hash[key]
-        if v[0] == 'W':
-            val = pickle.loads(v[1:])
-            # We could be sneaky, like pickle.Unpickler.load_inst,
-            # but I think that's overly confusing.
-            obj = self.wclass()
-            obj.__setstate__(val)
-            return obj
-        else:
-            return pickle.loads(v)
-
-    def __setitem__(self, key, val):
-        if isinstance(val, self.wclass):
-            val = val.__getstate__()
-            v = 'W' + pickle.dumps(val, 1)
-        else:
-            v = pickle.dumps(val, 1)
-        self.hash[key] = v
-
-    def __getitem__(self, key):
-        return pickle.loads(self.hash[key])
-
-    def __setitem__(self, key, val):
-        self.hash[key] = pickle.dumps(val, 1)
-
-    def __delitem__(self, key, val):
-        del(self.hash[key])
-
-    def __contains__(self, name):
-        return self.has_key(name)
-
-    def __getattr__(self, name):
-        # Pass the buck
-        return getattr(self.hash, name)
-
-    def get(self, key, dfl=None):
-        if self.has_key(key):
-            return self[key]
-        else:
-            return dfl
-
-open = DBDict
-
-def _test():
-    import doctest
-    import dbdict
-
-    doctest.testmod(dbdict)
-
-if __name__ == '__main__':
-    _test()
-
Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/storage.py,v
retrieving revision 1.2
diff -u -r1.2 storage.py
--- storage.py	26 Nov 2002 00:43:51 -0000	1.2
+++ storage.py	27 Nov 2002 07:27:38 -0000
@@ -4,7 +4,7 @@
 
 Classes:
     PickledClassifier - Classifier that uses a pickle db
-    DBDictClassifier - Classifier that uses a DBDict db
+    DBDictClassifier - Classifier that uses a DBM db
     Trainer - Classifier training observer
     SpamTrainer - Trainer for spam
     HamTrainer - Trainer for ham
@@ -17,8 +17,8 @@
     datastore.  This database is relatively small, but slower than other
     databases.
 
-    DBDictClassifier is a Classifier class that uses a DBDict
-    datastore.
+    DBDictClassifier is a Classifier class that uses a database
+    store.
 
     Trainer is concrete class that observes a Corpus and trains a
     Classifier object based upon movement of messages between corpora  When
@@ -49,8 +49,8 @@
 import classifier
 from Options import options
 import cPickle as pickle
-import dbdict
 import errno
+import shelve
 
 PICKLE_TYPE = 1
 NO_UPDATEPROBS = False   # Probabilities will not be autoupdated with training
@@ -83,10 +83,11 @@
             tempbayes = pickle.load(fp)
             fp.close()
 
+        # XXX: why not self.__setstate__(tempbayes.__getstate__())?
         if tempbayes:
             self.wordinfo = tempbayes.wordinfo
-            self.meta.nham = tempbayes.get_nham()
-            self.meta.nspam = tempbayes.get_nspam()
+            self.nham = tempbayes.nham
+            self.nspam = tempbayes.nspam
 
             if options.verbose:
                 print '%s is an existing pickle, with %d ham and %d spam' \
@@ -96,8 +97,8 @@
             if options.verbose:
                 print self.db_name,'is a new pickle'
             self.wordinfo = {}
-            self.meta.nham = 0
-            self.meta.nspam = 0
+            self.nham = 0
+            self.nspam = 0
 
     def store(self):
         '''Store self as a pickle'''
@@ -109,59 +110,78 @@
         pickle.dump(self, fp, PICKLE_TYPE)
         fp.close()
 
-    def __getstate__(self):
-        return PICKLE_TYPE, self.wordinfo, self.meta
-
-    def __setstate__(self, t):
-        if t[0] != PICKLE_TYPE:
-            raise ValueError("Can't unpickle -- version %s unknown" % t[0])
-        self.wordinfo, self.meta = t[1:]
-
 
 class DBDictClassifier(classifier.Classifier):
-    '''Classifier object persisted in a WIDict'''
+    '''Classifier object persisted in a caching database'''
 
     def __init__(self, db_name, mode='c'):
         '''Constructor(database name)'''
 
         classifier.Classifier.__init__(self)
+        self.wordcache = {}
         self.statekey = "saved state"
         self.mode = mode
         self.db_name = db_name
         self.load()
 
     def load(self):
-        '''Load state from WIDict'''
+        '''Load state from database'''
 
         if options.verbose:
-            print 'Loading state from',self.db_name,'WIDict'
+            print 'Loading state from',self.db_name,'database'
 
-        self.wordinfo = dbdict.DBDict(self.db_name, self.mode,
-                             classifier.WordInfo,iterskip=[self.statekey])
+        self.db = shelve.DbfilenameShelf(self.db_name, self.mode)
 
-        if self.wordinfo.has_key(self.statekey):
-            (nham, nspam) = self.wordinfo[self.statekey]
-            self.set_nham(nham)
-            self.set_nspam(nspam)
+        if self.db.has_key(self.statekey):
+            t = self.db[self.statekey]
+            if t[0] != classifier.PICKLE_VERSION:
+                raise ValueError("Can't unpickle -- version %s unknown" % t[0])
+            (self.nspam, self.nham) = t[1:]
 
             if options.verbose:
-                print '%s is an existing DBDict, with %d ham and %d spam' \
-                      % (self.db_name, self.nham, self.nspam)
+                print '%s is an existing database, with %d spam and %d ham' \
+                      % (self.db_name, self.nspam, self.nham)
         else:
             # new dbdict
             if options.verbose:
-                print self.db_name,'is a new DBDict'
-            self.set_nham(0)
-            self.set_nspam(0)
+                print self.db_name,'is a new database'
+            self.nspam = 0
+            self.nham = 0
+        self.wordinfo = {}
 
     def store(self):
         '''Place state into persistent store'''
 
         if options.verbose:
-            print 'Persisting',self.db_name,'state in WIDict'
+            print 'Persisting',self.db_name,'state in database'
+
+        for key, val in self.wordinfo.iteritems():
+            if val == None:
+                del self.wordinfo[key]
+                try:
+                    del self.db[key]
+                except KeyError:
+                    pass
+            else:
+                self.db[key] = val.__getstate__()
+        self.db[self.statekey] = (classifier.PICKLE_VERSION,
+                                  self.nspam, self.nham)
+        self.db.sync()
+
+    def _wordinfoget(self, word):
+        ret = self.wordinfo.get(word)
+        if not ret:
+            r = self.db.get(word)
+            if r:
+                ret = self.WordInfoClass()
+                ret.__setstate__(r)
+                self.wordinfo[word] = ret
+        return ret
+
+    # _wordinfoset is the same
 
-        self.wordinfo[self.statekey] = (self.get_nham(), self.get_nspam())
-        self.wordinfo.sync()
+    def _wordinfodel(self, word):
+        self.wordinfo[word] = None
 
 
 class Trainer:



More information about the Spambayes mailing list