[Spambayes-checkins] spambayes/spambayes Options.py, 1.117, 1.118 storage.py, 1.43, 1.44

Tony Meyer anadelonbrin at users.sourceforge.net
Mon Nov 22 01:26:47 CET 2004


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21225/spambayes

Modified Files:
	Options.py storage.py 
Log Message:
Add new storage types:

CDBClassifier
ZODBClassifier
ZEOClassifier

ZODB and ZEO need ZODB installed, obviously.  ZODB seems to work, but I'm only 50%
 sure that ZEO is working correctly.  I'll keep working on this as I can.

Add code so that persistent_storage_file is not expanded into an absolute path
 for storage types whose name is not a file path (e.g. the SQL ones).

Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.117
retrieving revision 1.118
diff -C2 -d -r1.117 -r1.118
*** Options.py	9 Nov 2004 02:37:41 -0000	1.117
--- Options.py	22 Nov 2004 00:26:44 -0000	1.118
***************
*** 518,522 ****
       with the default.""",
       # True == "dbm", False == "pickle", "True" == "dbm", "False" == "pickle"
!      ("mysql", "pgsql", "dbm", "pickle", "True", "False", True, False), RESTORE),
  
      ("persistent_storage_file", "Storage file name", "hammie.db",
--- 518,522 ----
       with the default.""",
       # True == "dbm", False == "pickle", "True" == "dbm", "False" == "pickle"
!      ("zeo", "zodb", "cdb", "mysql", "pgsql", "dbm", "pickle", "True", "False", True, False), RESTORE),
  
      ("persistent_storage_file", "Storage file name", "hammie.db",

Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** storage.py	28 Oct 2004 05:11:19 -0000	1.43
--- storage.py	22 Nov 2004 00:26:44 -0000	1.44
***************
*** 8,11 ****
--- 8,14 ----
      PGClassifier - Classifier that uses postgres
      mySQLClassifier - Classifier that uses mySQL
+     CDBClassifier - Classifier that uses CDB
+     ZODBClassifier - Classifier that uses ZODB
+     ZEOClassifier - Classifier that uses ZEO
      Trainer - Classifier training observer
      SpamTrainer - Trainer for spam
***************
*** 36,40 ****
  
  To Do:
-     o ZODBClassifier
      o Would Trainer.trainall really want to train with the whole corpus,
          or just a random subset?
--- 39,42 ----
***************
*** 43,47 ****
      '''
  
! # This module is part of the spambayes project, which is Copyright 2002
  # The Python Software Foundation and is covered by the Python Software
  # Foundation license.
--- 45,49 ----
      '''
  
! # This module is part of the spambayes project, which is Copyright 2002-5
  # The Python Software Foundation and is covered by the Python Software
  # Foundation license.
***************
*** 71,74 ****
--- 73,77 ----
  import errno
  import shelve
+ from spambayes import cdb
  from spambayes import dbmstorage
  
***************
*** 147,151 ****
          except IOError, e: 
              if options["globals", "verbose"]: 
!                 print 'Failed update: ' + str(e)
              if fp is not None: 
                  os.remove(tmp) 
--- 150,154 ----
          except IOError, e: 
              if options["globals", "verbose"]: 
!                 print >> sys.stderr, 'Failed update: ' + str(e)
              if fp is not None: 
                  os.remove(tmp) 
***************
*** 595,598 ****
--- 598,761 ----
  
  
+ class CDBClassifier(classifier.Classifier):
+     """A classifier that uses a CDB database.
+ 
+     A CDB wordinfo database is quite small and fast but is slow to update.
+     It is appropriate if training is done rarely (e.g. monthly or weekly
+     using archived ham and spam).
+     """
+     def __init__(self, db_name):
+         classifier.Classifier.__init__(self)
+         self.db_name = db_name
+         self.statekey = STATE_KEY
+         self.load()
+ 
+     def _WordInfoFactory(self, counts):
+         # For whatever reason, WordInfo's cannot be created with
+         # constructor ham/spam counts, so we do the work here.
+         # Since we're doing the work, we accept the ham/spam count
+         # in the form of a comma-delimited string, as that's what
+         # we get.
+         ham, spam = counts.split(',')
+         wi = classifier.WordInfo()
+         wi.hamcount = int(ham)
+         wi.spamcount = int(spam)
+         return wi
+ 
+     def load(self):
+         if os.path.exists(self.db_name):
+             db = open(self.db_name, "rb")
+             data = dict(cdb.Cdb(db))
+             db.close()
+             self.nham, self.nspam = [int(i) for i in \
+                                      data[self.statekey].split(',')]
+             self.wordinfo = dict([(k, self._WordInfoFactory(v)) \
+                                   for k, v in data.iteritems() \
+                                       if k != self.statekey])
+             if options["globals", "verbose"]:
+                 print >> sys.stderr, ('%s is an existing CDB,'
+                                       ' with %d ham and %d spam') \
+                                       % (self.db_name, self.nham,
+                                          self.nspam)
+         else:
+             if options["globals", "verbose"]:
+                 print >> sys.stderr, self.db_name, 'is a new CDB'
+             self.wordinfo = {}
+             self.nham = 0
+             self.nspam = 0
+ 
+     def store(self):
+         items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))]
+         for word, wi in self.wordinfo.iteritems():
+             items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount)))
+         db = open(self.db_name, "wb")
+         cdb.cdb_make(db, items)
+         db.close()
+ 
+     def close(self):
+         # We keep no resources open - nothing to do.
+         pass
+ 
+ 
+ # If ZODB isn't available, then this class won't be useable, but we
+ # still need to be able to import this module.  So we pretend that all
+ # is ok.
+ try:
+     Persistent
+ except NameError:
+     Persistent = object
+ class _PersistentClassifier(classifier.Classifier, Persistent):
+     def __init__(self):
+         import ZODB
+         from BTrees.OOBTree import OOBTree
+ 
+         classifier.Classifier.__init__(self)
+         self.wordinfo = OOBTree()
+ 
+ class ZODBClassifier(object):
+     def __init__(self, db_name):
+         self.statekey = STATE_KEY
+         self.db_name = db_name
+         self.load()
+ 
+     def __getattr__(self, att):
+         # We pretend that we are a classifier subclass.
+         if hasattr(self.classifier, att):
+             return getattr(self.classifier, att)
+         raise AttributeError("ZODBClassifier object has no attribute '%s'"
+                              % (att,))
+ 
+     def __setattr__(self, att, value):
+         # For some attributes, we change the classifier instead.
+         if att in ["nham", "nspam"]:
+             setattr(self.classifier, att, value)
+         else:
+             object.__setattr__(self, att, value)
+ 
+     def create_storage(self):
+         import ZODB
+         from ZODB.FileStorage import FileStorage
+         self.storage = FileStorage(self.db_name)
+ 
+     def load(self):
+         import ZODB
+         self.create_storage()
+         self.db = ZODB.DB(self.storage)
+         root = self.db.open().root()
+         self.classifier = root.get(self.db_name)
+         if self.classifier is None:
+             # There is no classifier, so create one.
+             if options["globals", "verbose"]:
+                 print >> sys.stderr, self.db_name, 'is a new ZODB'
+             self.classifier = root[self.db_name] = _PersistentClassifier()
+             get_transaction().commit()
+         else:
+             # It seems to me that the persistent classifier should store
+             # the nham and nspam values, but that doesn't appear to be the
+             # case, so work around that.  This can be removed once I figure
+             # out the problem.
+             self.nham, self.nspam = self.classifier.wordinfo[self.statekey]
+             if options["globals", "verbose"]:
+                 print >> sys.stderr, '%s is an existing ZODB, with %d ' \
+                       'ham and %d spam' % (self.db_name, self.nham,
+                                            self.nspam)
+         
+     def store(self):
+         # It seems to me that the persistent classifier should store
+         # the nham and nspam values, but that doesn't appear to be the
+         # case, so work around that.  This can be removed once I figure
+         # out the problem.
+         self.classifier.wordinfo[self.statekey] = (self.nham, self.nspam)
+         get_transaction().commit()
+ 
+     def close(self):
+         self.db.close()
+         self.storage.close()
+ 
+ 
+ class ZEOClassifier(ZODBClassifier):
+     def __init__(self, data_source_name):
+         source_info = data_source_name.split()
+         self.host = "localhost"
+         self.port = None
+         db_name = "SpamBayes"
+         for info in source_info:
+             if info.startswith("host"):
+                 self.host = info[5:]
+             elif info.startswith("port"):
+                 self.port = int(info[5:])
+             elif info.startswith("dbname"):
+                 db_name = info[7:]
+         ZODBClassifier.__init__(self, db_name)
+ 
+     def create_storage(self):
+         from ZEO.ClientStorage import ClientStorage
+         if self.port:
+             addr = self.host, self.port
+         else:
+             addr = self.host
+         self.storage = ClientStorage(addr)
+ 
+ 
  # Flags that the Trainer will recognise.  These should be or'able integer
  # values (i.e. 1, 2, 4, 8, etc.).
***************
*** 683,692 ****
          return "Only one type of database can be specified"
  
! # values are classifier class and True if it accepts a mode
! # arg, False otherwise
! _storage_types = {"dbm" : (DBDictClassifier, True),
!                   "pickle" : (PickledClassifier, False),
!                   "pgsql" : (PGClassifier, False),
!                   "mysql" : (mySQLClassifier, False),
                    }
  
--- 846,858 ----
          return "Only one type of database can be specified"
  
! # values are classifier class, True if it accepts a mode
! # arg, and True if the argument is a pathname
! _storage_types = {"dbm" : (DBDictClassifier, True, True),
!                   "pickle" : (PickledClassifier, False, True),
!                   "pgsql" : (PGClassifier, False, False),
!                   "mysql" : (mySQLClassifier, False, False),
!                   "cdb" : (CDBClassifier, False, True),
!                   "zodb" : (ZODBClassifier, False, True),
!                   "zeo" : (ZEOClassifier, False, False),
                    }
  
***************
*** 696,705 ****
      By centralizing this code here, all the applications will behave
      the same given the same options.
- 
-     db_type must be one of the following strings:
-       dbm, pickle, pgsql, mysql
      """
      try:
!         klass, supports_mode = _storage_types[db_type]
      except KeyError:
          raise NoSuchClassifierError(db_type)
--- 862,868 ----
      By centralizing this code here, all the applications will behave
      the same given the same options.
      """
      try:
!         klass, supports_mode, unused = _storage_types[db_type]
      except KeyError:
          raise NoSuchClassifierError(db_type)
***************
*** 727,731 ****
                       }
  
! def database_type(opts):
      """Return the name of the database and the type to use.  The output of
      this function can be used as the db_type parameter for the open_storage
--- 890,895 ----
                       }
  
! def database_type(opts, default_type=("Storage", "persistent_use_database"),
!                   default_name=("Storage", "persistent_storage_file")):
      """Return the name of the database and the type to use.  The output of
      this function can be used as the db_type parameter for the open_storage
***************
*** 752,761 ****
                  raise MutuallyExclusiveError()
      if nm is None and typ is None:
!         typ = options["Storage", "persistent_use_database"]
          if typ is True or typ == "True":
              typ = "dbm"
          elif typ is False or typ == "False":
              typ = "pickle"
!         nm = get_pathname_option("Storage", "persistent_storage_file")
      return nm, typ
  
--- 916,933 ----
                  raise MutuallyExclusiveError()
      if nm is None and typ is None:
!         typ = options[default_type]
!         # Backwards compatibility crud.
          if typ is True or typ == "True":
              typ = "dbm"
          elif typ is False or typ == "False":
              typ = "pickle"
!         try:
!             unused, unused, is_path = _storage_types[typ]
!         except KeyError:
!             raise NoSuchClassifierError(db_type)
!         if is_path:
!             nm = get_pathname_option(*default_name)
!         else:
!             nm = options[default_name]
      return nm, typ
  



More information about the Spambayes-checkins mailing list