[Spambayes-checkins] spambayes/spambayes Options.py, 1.117,
1.118 storage.py, 1.43, 1.44
Tony Meyer
anadelonbrin at users.sourceforge.net
Mon Nov 22 01:26:47 CET 2004
- Previous message: [Spambayes-checkins] spambayes/spambayes/test .cvsignore, NONE,
1.1 test_message.py, NONE, 1.1 test_sb_filter.py, NONE,
1.1 test_sb_dbexpimp.py, 1.2, 1.3 test_sb_imapfilter.py, 1.5, 1.6
- Next message: [Spambayes-checkins] spambayes/spambayes smtpproxy.py,1.8,1.9
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21225/spambayes
Modified Files:
Options.py storage.py
Log Message:
Add new storage types:
CDBClassifier
ZODBClassifier
ZEOClassifier
ZODB and ZEO need ZODB installed, obviously. ZODB seems to work, but I'm only 50%
sure that ZEO is working correctly. I'll keep working on this as I can.
Add code so that persistent_storage_file is not expanded into an absolute path
for certain storage types (e.g. the SQL ones).
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.117
retrieving revision 1.118
diff -C2 -d -r1.117 -r1.118
*** Options.py 9 Nov 2004 02:37:41 -0000 1.117
--- Options.py 22 Nov 2004 00:26:44 -0000 1.118
***************
*** 518,522 ****
with the default.""",
# True == "dbm", False == "pickle", "True" == "dbm", "False" == "pickle"
! ("mysql", "pgsql", "dbm", "pickle", "True", "False", True, False), RESTORE),
("persistent_storage_file", "Storage file name", "hammie.db",
--- 518,522 ----
with the default.""",
# True == "dbm", False == "pickle", "True" == "dbm", "False" == "pickle"
! ("zeo", "zodb", "cdb", "mysql", "pgsql", "dbm", "pickle", "True", "False", True, False), RESTORE),
("persistent_storage_file", "Storage file name", "hammie.db",
Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -C2 -d -r1.43 -r1.44
*** storage.py 28 Oct 2004 05:11:19 -0000 1.43
--- storage.py 22 Nov 2004 00:26:44 -0000 1.44
***************
*** 8,11 ****
--- 8,14 ----
PGClassifier - Classifier that uses postgres
mySQLClassifier - Classifier that uses mySQL
+ CBDClassifier - Classifier that uses CDB
+ ZODBClassifier - Classifier that uses ZODB
+ ZEOClassifier - Classifier that uses ZEO
Trainer - Classifier training observer
SpamTrainer - Trainer for spam
***************
*** 36,40 ****
To Do:
- o ZODBClassifier
o Would Trainer.trainall really want to train with the whole corpus,
or just a random subset?
--- 39,42 ----
***************
*** 43,47 ****
'''
! # This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
--- 45,49 ----
'''
! # This module is part of the spambayes project, which is Copyright 2002-5
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
***************
*** 71,74 ****
--- 73,77 ----
import errno
import shelve
+ from spambayes import cdb
from spambayes import dbmstorage
***************
*** 147,151 ****
except IOError, e:
if options["globals", "verbose"]:
! print 'Failed update: ' + str(e)
if fp is not None:
os.remove(tmp)
--- 150,154 ----
except IOError, e:
if options["globals", "verbose"]:
! print >> sys.stderr, 'Failed update: ' + str(e)
if fp is not None:
os.remove(tmp)
***************
*** 595,598 ****
--- 598,761 ----
+ class CDBClassifier(classifier.Classifier):
+ """A classifier that uses a CDB database.
+
+ A CDB wordinfo database is quite small and fast but is slow to update.
+ It is appropriate if training is done rarely (e.g. monthly or weekly
+ using archived ham and spam).
+ """
+ def __init__(self, db_name):
+ classifier.Classifier.__init__(self)
+ self.db_name = db_name
+ self.statekey = STATE_KEY
+ self.load()
+
+ def _WordInfoFactory(self, counts):
+ # For whatever reason, WordInfo's cannot be created with
+ # constructor ham/spam counts, so we do the work here.
+ # Since we're doing the work, we accept the ham/spam count
+ # in the form of a comma-delimited string, as that's what
+ # we get.
+ ham, spam = counts.split(',')
+ wi = classifier.WordInfo()
+ wi.hamcount = int(ham)
+ wi.spamcount = int(spam)
+ return wi
+
+ def load(self):
+ if os.path.exists(self.db_name):
+ db = open(self.db_name, "rb")
+ data = dict(cdb.Cdb(db))
+ db.close()
+ self.nham, self.nspam = [int(i) for i in \
+ data[self.statekey].split(',')]
+ self.wordinfo = dict([(k, self._WordInfoFactory(v)) \
+ for k, v in data.iteritems() \
+ if k != self.statekey])
+ if options["globals", "verbose"]:
+ print >> sys.stderr, ('%s is an existing CDB,'
+ ' with %d ham and %d spam') \
+ % (self.db_name, self.nham,
+ self.nspam)
+ else:
+ if options["globals", "verbose"]:
+ print >> sys.stderr, self.db_name, 'is a new CDB'
+ self.wordinfo = {}
+ self.nham = 0
+ self.nspam = 0
+
+ def store(self):
+ items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))]
+ for word, wi in self.wordinfo.iteritems():
+ items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount)))
+ db = open(self.db_name, "wb")
+ cdb.cdb_make(db, items)
+ db.close()
+
+ def close(self):
+ # We keep no resources open - nothing to do.
+ pass
+
+
+ # If ZODB isn't available, then this class won't be useable, but we
+ # still need to be able to import this module. So we pretend that all
+ # is ok.
+ try:
+ Persistent
+ except NameError:
+ Persistent = object
+ class _PersistentClassifier(classifier.Classifier, Persistent):
+ def __init__(self):
+ import ZODB
+ from BTrees.OOBTree import OOBTree
+
+ classifier.Classifier.__init__(self)
+ self.wordinfo = OOBTree()
+
+ class ZODBClassifier(object):
+ def __init__(self, db_name):
+ self.statekey = STATE_KEY
+ self.db_name = db_name
+ self.load()
+
+ def __getattr__(self, att):
+ # We pretend that we are a classifier subclass.
+ if hasattr(self.classifier, att):
+ return getattr(self.classifier, att)
+ raise AttributeError("ZODBClassifier object has no attribute '%s'"
+ % (att,))
+
+ def __setattr__(self, att, value):
+ # For some attributes, we change the classifier instead.
+ if att in ["nham", "nspam"]:
+ setattr(self.classifier, att, value)
+ else:
+ object.__setattr__(self, att, value)
+
+ def create_storage(self):
+ import ZODB
+ from ZODB.FileStorage import FileStorage
+ self.storage = FileStorage(self.db_name)
+
+ def load(self):
+ import ZODB
+ self.create_storage()
+ self.db = ZODB.DB(self.storage)
+ root = self.db.open().root()
+ self.classifier = root.get(self.db_name)
+ if self.classifier is None:
+ # There is no classifier, so create one.
+ if options["globals", "verbose"]:
+ print >> sys.stderr, self.db_name, 'is a new ZODB'
+ self.classifier = root[self.db_name] = _PersistentClassifier()
+ get_transaction().commit()
+ else:
+ # It seems to me that the persistent classifier should store
+ # the nham and nspam values, but that doesn't appear to be the
+ # case, so work around that. This can be removed once I figure
+ # out the problem.
+ self.nham, self.nspam = self.classifier.wordinfo[self.statekey]
+ if options["globals", "verbose"]:
+ print >> sys.stderr, '%s is an existing ZODB, with %d ' \
+ 'ham and %d spam' % (self.db_name, self.nham,
+ self.nspam)
+
+ def store(self):
+ # It seems to me that the persistent classifier should store
+ # the nham and nspam values, but that doesn't appear to be the
+ # case, so work around that. This can be removed once I figure
+ # out the problem.
+ self.classifier.wordinfo[self.statekey] = (self.nham, self.nspam)
+ get_transaction().commit()
+
+ def close(self):
+ self.db.close()
+ self.storage.close()
+
+
+ class ZEOClassifier(ZODBClassifier):
+ def __init__(self, data_source_name):
+ source_info = data_source_name.split()
+ self.host = "localhost"
+ self.port = None
+ db_name = "SpamBayes"
+ for info in source_info:
+ if info.startswith("host"):
+ self.host = info[5:]
+ elif info.startswith("port"):
+ self.port = int(info[5:])
+ elif info.startswith("dbname"):
+ db_name = info[7:]
+ ZODBClassifier.__init__(self, db_name)
+
+ def create_storage(self):
+ from ZEO.ClientStorage import ClientStorage
+ if self.port:
+ addr = self.host, self.port
+ else:
+ addr = self.host
+ self.storage = ClientStorage(addr)
+
+
# Flags that the Trainer will recognise. These should be or'able integer
# values (i.e. 1, 2, 4, 8, etc.).
***************
*** 683,692 ****
return "Only one type of database can be specified"
! # values are classifier class and True if it accepts a mode
! # arg, False otherwise
! _storage_types = {"dbm" : (DBDictClassifier, True),
! "pickle" : (PickledClassifier, False),
! "pgsql" : (PGClassifier, False),
! "mysql" : (mySQLClassifier, False),
}
--- 846,858 ----
return "Only one type of database can be specified"
! # values are classifier class, True if it accepts a mode
! # arg, and True if the argument is a pathname
! _storage_types = {"dbm" : (DBDictClassifier, True, True),
! "pickle" : (PickledClassifier, False, True),
! "pgsql" : (PGClassifier, False, False),
! "mysql" : (mySQLClassifier, False, False),
! "cdb" : (CDBClassifier, False, True),
! "zodb" : (ZODBClassifier, False, True),
! "zeo" : (ZEOClassifier, False, False),
}
***************
*** 696,705 ****
By centralizing this code here, all the applications will behave
the same given the same options.
-
- db_type must be one of the following strings:
- dbm, pickle, pgsql, mysql
"""
try:
! klass, supports_mode = _storage_types[db_type]
except KeyError:
raise NoSuchClassifierError(db_type)
--- 862,868 ----
By centralizing this code here, all the applications will behave
the same given the same options.
"""
try:
! klass, supports_mode, unused = _storage_types[db_type]
except KeyError:
raise NoSuchClassifierError(db_type)
***************
*** 727,731 ****
}
! def database_type(opts):
"""Return the name of the database and the type to use. The output of
this function can be used as the db_type parameter for the open_storage
--- 890,895 ----
}
! def database_type(opts, default_type=("Storage", "persistent_use_database"),
! default_name=("Storage", "persistent_storage_file")):
"""Return the name of the database and the type to use. The output of
this function can be used as the db_type parameter for the open_storage
***************
*** 752,761 ****
raise MutuallyExclusiveError()
if nm is None and typ is None:
! typ = options["Storage", "persistent_use_database"]
if typ is True or typ == "True":
typ = "dbm"
elif typ is False or typ == "False":
typ = "pickle"
! nm = get_pathname_option("Storage", "persistent_storage_file")
return nm, typ
--- 916,933 ----
raise MutuallyExclusiveError()
if nm is None and typ is None:
! typ = options[default_type]
! # Backwards compatibility crud.
if typ is True or typ == "True":
typ = "dbm"
elif typ is False or typ == "False":
typ = "pickle"
! try:
! unused, unused, is_path = _storage_types[typ]
! except KeyError:
! raise NoSuchClassifierError(db_type)
! if is_path:
! nm = get_pathname_option(*default_name)
! else:
! nm = options[default_name]
return nm, typ
- Previous message: [Spambayes-checkins] spambayes/spambayes/test .cvsignore, NONE,
1.1 test_message.py, NONE, 1.1 test_sb_filter.py, NONE,
1.1 test_sb_dbexpimp.py, 1.2, 1.3 test_sb_imapfilter.py, 1.5, 1.6
- Next message: [Spambayes-checkins] spambayes/spambayes smtpproxy.py,1.8,1.9
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Spambayes-checkins
mailing list