[Spambayes-checkins]
spambayes Bayes.py,1.5,1.5.2.1 Options.py,1.72,1.72.2.1
hammie.py,1.40,1.40.2.1 hammiefilter.py,1.2,1.2.2.1
pop3proxy.py,1.16,1.16.2.1
Neale Pickett
npickett@users.sourceforge.net
Tue Nov 19 23:45:27 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv20373
Modified Files:
Tag: hammie-playground
Bayes.py Options.py hammie.py hammiefilter.py pop3proxy.py
Log Message:
* Removes DBDict and PersistentBayes from hammie.py
* hammie.py is no longer an executable, just a container for the
Hammie class
* Splits persistent_use_database into pop3proxy and hammiefilter
sections
Index: Bayes.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Bayes.py,v
retrieving revision 1.5
retrieving revision 1.5.2.1
diff -C2 -d -r1.5 -r1.5.2.1
*** Bayes.py 18 Nov 2002 13:04:20 -0000 1.5
--- Bayes.py 19 Nov 2002 23:45:24 -0000 1.5.2.1
***************
*** 41,47 ****
o ZODBBayes
o Would Trainer.trainall really want to train with the whole corpus,
! or just a random subset?
! o Corpus.Verbose is a bit of a strange thing to have. Verbose should be
! in the global namespace, but how do you get it there?
o Suggestions?
--- 41,47 ----
o ZODBBayes
o Would Trainer.trainall really want to train with the whole corpus,
! or just a random subset?
! o Corpus.Verbose is a bit of a strange thing to have. Verbose
! should be in the global namespace, but how do you get it there?
o Suggestions?
***************
*** 57,65 ****
import Corpus
! from classifier import Bayes
from Options import options
- from hammie import DBDict # hammie only for DBDict, which should
- # probably really be somewhere else
import cPickle as pickle
import errno
import copy
--- 57,64 ----
import Corpus
! import classifier
from Options import options
import cPickle as pickle
+ import dbdict
import errno
import copy
***************
*** 70,74 ****
UPDATEPROBS = True # Probabilities will be autoupdated with training
! class PersistentBayes(Bayes):
'''Persistent Bayes database object'''
--- 69,73 ----
UPDATEPROBS = True # Probabilities will be autoupdated with training
! class PersistentBayes(classifier.Bayes):
'''Persistent Bayes database object'''
***************
*** 170,179 ****
class DBDictBayes(PersistentBayes):
'''Bayes object persisted in a hammie.DB_Dict'''
! def __init__(self, db_name):
'''Constructor(database name)'''
self.db_name = db_name
self.statekey = "saved state"
--- 169,215 ----
+ class WIDict(dbdict.DBDict):
+ """DBDict optimized for holding lots of WordInfo objects.
+
+ Normally, the pickler can figure out that you're pickling the same
+ type thing over and over, and will just tag the type with a new
+ byte, thus reducing Administrative Pickle Bloat(R). Since the
+ DBDict continually creates new picklers, however, nothing ever gets
+ the chance to do this optimization.
+
+ The WIDict class forces this optimization by stealing the
+ (currently) unused 'W' pickle type for WordInfo objects. This
+ results in about a 50% reduction in database size.
+
+ """
+
+ def __getitem__(self, key):
+ v = self.hash[key]
+ if v[0] == 'W':
+ val = pickle.loads(v[1:])
+ # We could be sneaky, like pickle.Unpickler.load_inst,
+ # but I think that's overly confusing.
+ obj = classifier.WordInfo(0)
+ obj.__setstate__(val)
+ return obj
+ else:
+ return pickle.loads(v)
+
+ def __setitem__(self, key, val):
+ if isinstance(val, classifier.WordInfo):
+ val = val.__getstate__()
+ v = 'W' + pickle.dumps(val, 1)
+ else:
+ v = pickle.dumps(val, 1)
+ self.hash[key] = v
+
+
class DBDictBayes(PersistentBayes):
'''Bayes object persisted in a hammie.DB_Dict'''
! def __init__(self, db_name, mode='c'):
'''Constructor(database name)'''
+ self.mode = mode
self.db_name = db_name
self.statekey = "saved state"
***************
*** 187,191 ****
print 'Loading state from',self.db_name,'DB_Dict'
! self.wordinfo = DBDict(self.db_name, 'c')
if self.wordinfo.has_key(self.statekey):
--- 223,228 ----
print 'Loading state from',self.db_name,'DB_Dict'
! self.wordinfo = WIDict(self.db_name, self.mode,
! iterskip=[self.statekey])
if self.wordinfo.has_key(self.statekey):
***************
*** 217,221 ****
def __init__(self, bayes, trainertype, updateprobs=NO_UPDATEPROBS):
'''Constructor(Bayes, \
! Corpus.SPAM|Corpus.HAM), updprobs(True|False)'''
self.bayes = bayes
--- 254,258 ----
def __init__(self, bayes, trainertype, updateprobs=NO_UPDATEPROBS):
'''Constructor(Bayes, \
! Corpus.SPAM|Corpus.HAM), updprobs(True|False)'''
self.bayes = bayes
***************
*** 287,289 ****
if __name__ == '__main__':
! print >>sys.stderr, __doc__
\ No newline at end of file
--- 324,326 ----
if __name__ == '__main__':
! print >>sys.stderr, __doc__
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.72
retrieving revision 1.72.2.1
diff -C2 -d -r1.72 -r1.72.2.1
*** Options.py 18 Nov 2002 19:14:48 -0000 1.72
--- Options.py 19 Nov 2002 23:45:24 -0000 1.72.2.1
***************
*** 349,356 ****
persistent_storage_file: hammie.db
! # hammie can use either a database (quick to score one message) or a pickle
! # (quick to train on huge amounts of messages). Set this to True to use a
! # database by default.
! persistent_use_database: False
[pop3proxy]
--- 349,357 ----
persistent_storage_file: hammie.db
! [hammiefilter]
! # hammiefilter can use either a database (quick to score one message) or
! # a pickle (quick to train on huge amounts of messages). Set this to
! # True to use a database by default.
! hammiefilter_persistent_use_database: False
[pop3proxy]
***************
*** 367,370 ****
--- 368,372 ----
pop3proxy_ham_cache: pop3proxy-ham-cache
pop3proxy_unknown_cache: pop3proxy-unknown-cache
+ pop3proxy_persistent_use_database: False
[html_ui]
***************
*** 441,444 ****
--- 443,448 ----
'hammie_debug_header_name': string_cracker,
},
+ 'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker,
+ },
'pop3proxy': {'pop3proxy_server_name': string_cracker,
'pop3proxy_server_port': int_cracker,
***************
*** 449,452 ****
--- 453,457 ----
'pop3proxy_ham_cache': string_cracker,
'pop3proxy_unknown_cache': string_cracker,
+ 'pop3proxy_persistent_use_database': string_cracker,
},
'html_ui': {'html_ui_port': int_cracker,
Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.40
retrieving revision 1.40.2.1
diff -C2 -d -r1.40 -r1.40.2.1
*** hammie.py 18 Nov 2002 18:13:54 -0000 1.40
--- hammie.py 19 Nov 2002 23:45:24 -0000 1.40.2.1
***************
*** 1,56 ****
#! /usr/bin/env python
- # A driver for the classifier module and Tim's tokenizer that you can
- # call from procmail.
-
- """Usage: %(program)s [options]
-
- Where:
- -h
- show usage and exit
- -g PATH
- mbox or directory of known good messages (non-spam) to train on.
- Can be specified more than once, or use - for stdin.
- -s PATH
- mbox or directory of known spam messages to train on.
- Can be specified more than once, or use - for stdin.
- -u PATH
- mbox of unknown messages. A ham/spam decision is reported for each.
- Can be specified more than once.
- -r
- reverse the meaning of the check (report ham instead of spam).
- Only meaningful with the -u option.
- -p FILE
- use file as the persistent store. loads data from this file if it
- exists, and saves data to this file at the end.
- Default: %(DEFAULTDB)s
- -d
- use the DBM store instead of cPickle. The file is larger and
- creating it is slower, but checking against it is much faster,
- especially for large word databases. Default: %(USEDB)s
- -D
- the reverse of -d: use the cPickle instead of DBM
- -f
- run as a filter: read a single message from stdin, add an
- %(DISPHEADER)s header, and write it to stdout. If you want to
- run from procmail, this is your option.
- """
-
- from __future__ import generators
-
- import sys
- import os
- import types
- import getopt
- import mailbox
- import glob
- import email
- import errno
- import anydbm
- import cPickle as pickle
import mboxutils
! import classifier
from Options import options
try:
--- 1,10 ----
#! /usr/bin/env python
+ import dbdict
import mboxutils
! import Bayes
from Options import options
+ from tokenizer import tokenize
try:
***************
*** 61,224 ****
! program = sys.argv[0] # For usage(); referenced by docstring above
!
! # Name of the header to add in filter mode
! DISPHEADER = options.hammie_header_name
! DEBUGHEADER = options.hammie_debug_header_name
! DODEBUG = options.hammie_debug_header
!
! # Default database name
! DEFAULTDB = options.persistent_storage_file
!
! # Probability at which a message is considered spam
! SPAM_THRESHOLD = options.spam_cutoff
! HAM_THRESHOLD = options.ham_cutoff
!
! # Probability limit for a clue to be added to the DISPHEADER
! SHOWCLUE = options.clue_mailheader_cutoff
!
! # Use a database? If False, use a pickle
! USEDB = options.persistent_use_database
!
! # Tim's tokenizer kicks far more booty than anything I would have
! # written. Score one for analysis ;)
! from tokenizer import tokenize
!
! class DBDict:
!
! """Database Dictionary.
!
! This wraps an anydbm to make it look even more like a dictionary.
!
! Call it with the name of your database file. Optionally, you can
! specify a list of keys to skip when iterating. This only affects
! iterators; things like .keys() still list everything. For instance:
!
! >>> d = DBDict('/tmp/goober.db', ('skipme', 'skipmetoo'))
! >>> d['skipme'] = 'booga'
! >>> d['countme'] = 'wakka'
! >>> print d.keys()
! ['skipme', 'countme']
! >>> for k in d.iterkeys():
! ... print k
! countme
!
! """
!
! def __init__(self, dbname, mode, iterskip=()):
! self.hash = anydbm.open(dbname, mode)
! self.iterskip = iterskip
!
! def __getitem__(self, key):
! v = self.hash[key]
! if v[0] == 'W':
! val = pickle.loads(v[1:])
! # We could be sneaky, like pickle.Unpickler.load_inst,
! # but I think that's overly confusing.
! obj = classifier.WordInfo(0)
! obj.__setstate__(val)
! return obj
! else:
! return pickle.loads(v)
!
! def __setitem__(self, key, val):
! if isinstance(val, classifier.WordInfo):
! val = val.__getstate__()
! v = 'W' + pickle.dumps(val, 1)
! else:
! v = pickle.dumps(val, 1)
! self.hash[key] = v
!
! def __delitem__(self, key, val):
! del(self.hash[key])
!
! def __iter__(self, fn=None):
! k = self.hash.first()
! while k != None:
! key = k[0]
! val = self.__getitem__(key)
! if key not in self.iterskip:
! if fn:
! yield fn((key, val))
! else:
! yield (key, val)
! try:
! k = self.hash.next()
! except KeyError:
! break
!
! def __contains__(self, name):
! return self.has_key(name)
!
! def __getattr__(self, name):
! # Pass the buck
! return getattr(self.hash, name)
!
! def get(self, key, dfl=None):
! if self.has_key(key):
! return self[key]
! else:
! return dfl
!
! def iteritems(self):
! return self.__iter__()
!
! def iterkeys(self):
! return self.__iter__(lambda k: k[0])
!
! def itervalues(self):
! return self.__iter__(lambda k: k[1])
!
!
! class PersistentBayes(classifier.Bayes):
!
! """A persistent Bayes classifier.
!
! This is just like classifier.Bayes, except that the dictionary is a
! database. You take less disk this way and you can pretend it's
! persistent. The tradeoffs vs. a pickle are: 1. it's slower
! training, but faster checking, and 2. it needs less memory to run,
! but takes more space on the hard drive.
! On destruction, an instantiation of this class will write its state
! to a special key. When you instantiate a new one, it will attempt
! to read these values out of that key again, so you can pick up where
! you left off.
"""
- # XXX: Would it be even faster to remember (in a list) which keys
- # had been modified, and only recalculate those keys? No sense in
- # going over the entire word database if only 100 words are
- # affected.
-
- # XXX: Another idea: cache stuff in memory. But by then maybe we
- # should just use ZODB.
-
- def __init__(self, dbname, mode):
- classifier.Bayes.__init__(self)
- self.statekey = "saved state"
- self.wordinfo = DBDict(dbname, mode, (self.statekey,))
- self.dbmode = mode
-
- self.restore_state()
-
- def __del__(self):
- #super.__del__(self)
- self.save_state()
-
- def save_state(self):
- if self.dbmode != 'r':
- self.wordinfo[self.statekey] = (self.nham, self.nspam)
-
- def restore_state(self):
- if self.wordinfo.has_key(self.statekey):
- self.nham, self.nspam = self.wordinfo[self.statekey]
-
-
- class Hammie:
-
- """A spambayes mail filter"""
-
def __init__(self, bayes):
self.bayes = bayes
--- 15,26 ----
! class Hammie:
! """A spambayes mail filter.
! This implements the basic functionality needed to score, filter, or
! train.
"""
def __init__(self, bayes):
self.bayes = bayes
***************
*** 263,269 ****
traceback.print_exc()
! def filter(self, msg, header=DISPHEADER, spam_cutoff=SPAM_THRESHOLD,
! ham_cutoff=HAM_THRESHOLD, debugheader=DEBUGHEADER,
! debug=DODEBUG):
"""Score (judge) a message and add a disposition header.
--- 65,71 ----
traceback.print_exc()
! def filter(self, msg, header=None, spam_cutoff=None,
! ham_cutoff=None, debugheader=None,
! debug=None):
"""Score (judge) a message and add a disposition header.
***************
*** 283,286 ****
--- 85,99 ----
"""
+ if header == None:
+ header = options.hammie_header_name
+ if spam_cutoff == None:
+ spam_cutoff = options.spam_cutoff
+ if ham_cutoff == None:
+ ham_cutoff = options.ham_cutoff
+ if debugheader == None:
+ debugheader = options.hammie_debug_header_name
+ if debug == None:
+ debug = options.hammie_debug_header
+
msg = mboxutils.get_message(msg)
try:
***************
*** 349,353 ****
self.train(msg, True)
! def update_probabilities(self):
"""Update probability values.
--- 162,166 ----
self.train(msg, True)
! def update_probabilities(self, store=True):
"""Update probability values.
***************
*** 356,510 ****
until you're all done before calling this.
"""
self.bayes.update_probabilities()
! def train(hammie, msgs, is_spam):
! """Train bayes with all messages from a mailbox."""
! mbox = mboxutils.getmbox(msgs)
! i = 0
! for msg in mbox:
! i += 1
! # XXX: Is the \r a Unixism? I seem to recall it working in DOS
! # back in the day. Maybe it's a line-printer-ism ;)
! sys.stdout.write("\r%6d" % i)
! sys.stdout.flush()
! hammie.train(msg, is_spam)
! print
!
! def score(hammie, msgs, reverse=0):
! """Score (judge) all messages from a mailbox."""
! # XXX The reporting needs work!
! mbox = mboxutils.getmbox(msgs)
! i = 0
! spams = hams = 0
! for msg in mbox:
! i += 1
! prob, clues = hammie.score(msg, True)
! if hasattr(msg, '_mh_msgno'):
! msgno = msg._mh_msgno
! else:
! msgno = i
! isspam = (prob >= SPAM_THRESHOLD)
! if isspam:
! spams += 1
! if not reverse:
! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print hammie.formatclues(clues)
! else:
! hams += 1
! if reverse:
! print "%6s %4.2f %1s" % (msgno, prob, isspam and "S" or "."),
! print hammie.formatclues(clues)
! return (spams, hams)
!
! def createbayes(pck=DEFAULTDB, usedb=False, mode='r'):
! """Create a Bayes instance for the given pickle (which
! doesn't have to exist). Create a PersistentBayes if
! usedb is True."""
! if usedb:
! bayes = PersistentBayes(pck, mode)
! else:
! bayes = None
! try:
! fp = open(pck, 'rb')
! except IOError, e:
! if e.errno <> errno.ENOENT: raise
! else:
! bayes = pickle.load(fp)
! fp.close()
! if bayes is None:
! bayes = classifier.Bayes()
! return bayes
!
! def usage(code, msg=''):
! """Print usage message and sys.exit(code)."""
! if msg:
! print >> sys.stderr, msg
! print >> sys.stderr
! print >> sys.stderr, __doc__ % globals()
! sys.exit(code)
!
! def main():
! """Main program; parse options and go."""
! try:
! opts, args = getopt.getopt(sys.argv[1:], 'hdDfg:s:p:u:r')
! except getopt.error, msg:
! usage(2, msg)
!
! if not opts:
! usage(2, "No options given")
!
! pck = DEFAULTDB
! good = []
! spam = []
! unknown = []
! reverse = 0
! do_filter = False
! usedb = USEDB
! mode = 'r'
! for opt, arg in opts:
! if opt == '-h':
! usage(0)
! elif opt == '-g':
! good.append(arg)
! mode = 'c'
! elif opt == '-s':
! spam.append(arg)
! mode = 'c'
! elif opt == '-p':
! pck = arg
! elif opt == "-d":
! usedb = True
! elif opt == "-D":
! usedb = False
! elif opt == "-f":
! do_filter = True
! elif opt == '-u':
! unknown.append(arg)
! elif opt == '-r':
! reverse = 1
! if args:
! usage(2, "Positional arguments not allowed")
! save = False
! bayes = createbayes(pck, usedb, mode)
! h = Hammie(bayes)
- for g in good:
- print "Training ham (%s):" % g
- train(h, g, False)
- save = True
! for s in spam:
! print "Training spam (%s):" % s
! train(h, s, True)
! save = True
! if save:
! h.update_probabilities()
! if not usedb and pck:
! fp = open(pck, 'wb')
! pickle.dump(bayes, fp, 1)
! fp.close()
! if do_filter:
! msg = sys.stdin.read()
! filtered = h.filter(msg)
! sys.stdout.write(filtered)
! if unknown:
! (spams, hams) = (0, 0)
! for u in unknown:
! if len(unknown) > 1:
! print "Scoring", u
! s, g = score(h, u, reverse)
! spams += s
! hams += g
! print "Total %d spam, %d ham" % (spams, hams)
- if __name__ == "__main__":
- main()
--- 169,207 ----
until you're all done before calling this.
+ Unless store is false, the persistent store will be written after
+ updating probabilities.
+
"""
self.bayes.update_probabilities()
+ if store:
+ self.store()
+ def store(self):
+ """Write out the persistent store.
! This makes sure the persistent store reflects what is currently
! in memory. You would want to do this after a write and before
! exiting.
! """
! self.bayes.store()
! def open(filename, usedb=True, mode='r'):
! """Open a file, returning a Hammie instance.
! If usedb is False, open as a pickle instead of a DBDict. mode is
! used as the flag to open DBDict objects. 'c' for read-write (create
! if needed), 'r' for read-only, 'w' for read-write.
! """
+ if usedb:
+ b = Bayes.DBDictBayes(filename, mode)
+ else:
+ b = Bayes.PickledBayes(filename)
+ return Hammie(b)
Index: hammiefilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v
retrieving revision 1.2
retrieving revision 1.2.2.1
diff -C2 -d -r1.2 -r1.2.2.1
*** hammiefilter.py 18 Nov 2002 18:14:04 -0000 1.2
--- hammiefilter.py 19 Nov 2002 23:45:25 -0000 1.2.2.1
***************
*** 52,92 ****
sys.exit(code)
- def jar_pickle(h):
- if not options.persistent_use_database:
- import pickle
- fp = open(options.persistent_storage_file, 'wb')
- pickle.dump(h.bayes, fp, 1)
- fp.close()
-
-
- def hammie_open(mode):
- b = hammie.createbayes(options.persistent_storage_file,
- options.persistent_use_database,
- mode)
- return hammie.Hammie(b)
-
def newdb():
! h = hammie_open('n')
! jar_pickle(h)
print "Created new database in", options.persistent_storage_file
def filter():
! h = hammie_open('r')
msg = sys.stdin.read()
print h.filter(msg)
def train_ham():
! h = hammie_open('w')
msg = sys.stdin.read()
h.train_ham(msg)
h.update_probabilities()
! jar_pickle(h)
def train_spam():
! h = hammie_open('w')
msg = sys.stdin.read()
h.train_spam(msg)
h.update_probabilities()
! jar_pickle(h)
def main():
--- 52,86 ----
sys.exit(code)
def newdb():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'n')
! h.store()
print "Created new database in", options.persistent_storage_file
def filter():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'r')
msg = sys.stdin.read()
print h.filter(msg)
def train_ham():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'w')
msg = sys.stdin.read()
h.train_ham(msg)
h.update_probabilities()
! h.store()
def train_spam():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'w')
msg = sys.stdin.read()
h.train_spam(msg)
h.update_probabilities()
! h.store()
def main():
***************
*** 104,112 ****
# hammiefilter overrides
- config_overrides = """[Hammie]
- persistent_storage_file = %s
- persistent_use_database = True
- """ % os.path.expanduser('~/.hammiedb')
- options.mergefilelike(StringIO.StringIO(config_overrides))
options.mergefiles(['/etc/hammierc',
os.path.expanduser('~/.hammierc')])
--- 98,101 ----
Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.16
retrieving revision 1.16.2.1
diff -C2 -d -r1.16 -r1.16.2.1
*** pop3proxy.py 18 Nov 2002 19:14:48 -0000 1.16
--- pop3proxy.py 19 Nov 2002 23:45:25 -0000 1.16.2.1
***************
*** 1051,1056 ****
self.serverName = options.pop3proxy_server_name
self.serverPort = options.pop3proxy_server_port
! self.databaseFilename = options.persistent_storage_file
! self.useDB = options.persistent_use_database
self.uiPort = options.html_ui_port
self.launchUI = options.html_ui_launch_browser
--- 1051,1056 ----
self.serverName = options.pop3proxy_server_name
self.serverPort = options.pop3proxy_server_port
! self.databaseFilename = options.pop3proxy_persistent_storage_file
! self.useDB = options.pop3proxy_persistent_use_database
self.uiPort = options.html_ui_port
self.launchUI = options.html_ui_launch_browser
More information about the Spambayes-checkins
mailing list