[Spambayes-checkins]
spambayes Bayes.py,1.5.2.3,1.5.2.4 Options.py,1.72.2.3,1.72.2.4
classifier.py,1.53.2.1,1.53.2.2 hammiefilter.py,1.2.2.1,1.2.2.2
Neale Pickett
npickett@users.sourceforge.net
Thu Nov 21 04:16:39 2002
- Previous message: [Spambayes-checkins] spambayes dbdict.py,1.1.2.2,1.1.2.3
- Next message: [Spambayes-checkins]
spambayes Bayes.py,1.5.2.4,1.5.2.5 classifier.py,1.53.2.2,1.53.2.3
hammie.py,1.40.2.1,1.40.2.2 hammiefilter.py,1.2.2.2,1.2.2.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv25529
Modified Files:
Tag: hammie-playground
Bayes.py Options.py classifier.py hammiefilter.py
Log Message:
Bayes.py: __init__ cleanup
Options.py: moved persistent_storage_file out to hammiefilter and
pop3proxy sections.
classifier.py: New MetaInfo class which keeps counters
for nham and nspam, also a revision, incremented every
time either is changed.
WordInfo class calculates probability on the fly iff
MetaInfo revision has changed since last calculation.
Probabilities are no longer stored in the persistent
databases.
hammiefilter.py: takes advantage of all this stuff :)
Index: Bayes.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Bayes.py,v
retrieving revision 1.5.2.3
retrieving revision 1.5.2.4
diff -C2 -d -r1.5.2.3 -r1.5.2.4
*** Bayes.py 21 Nov 2002 02:58:37 -0000 1.5.2.3
--- Bayes.py 21 Nov 2002 04:16:36 -0000 1.5.2.4
***************
*** 71,74 ****
--- 71,75 ----
'''Constructor(database name)'''
+ classifier.Bayes.__init__(self)
self.db_name = db_name
self.load()
***************
*** 186,190 ****
# We could be sneaky, like pickle.Unpickler.load_inst,
# but I think that's overly confusing.
! obj = classifier.WordInfo(0)
obj.__setstate__(val)
return obj
--- 187,191 ----
# We could be sneaky, like pickle.Unpickler.load_inst,
# but I think that's overly confusing.
! obj = classifier.WordInfo()
obj.__setstate__(val)
return obj
***************
*** 211,215 ****
self.statekey = "saved state"
! self.load()
def load(self):
--- 212,216 ----
self.statekey = "saved state"
! PersistentBayes.__init__(self, db_name)
def load(self):
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.72.2.3
retrieving revision 1.72.2.4
diff -C2 -d -r1.72.2.3 -r1.72.2.4
*** Options.py 20 Nov 2002 06:06:27 -0000 1.72.2.3
--- Options.py 21 Nov 2002 04:16:36 -0000 1.72.2.4
***************
*** 346,352 ****
clue_mailheader_cutoff: 0.5
- # The default database path used by hammie
- persistent_storage_file: hammie.db
-
[hammiefilter]
# hammiefilter can use either a database (quick to score one message) or
--- 346,349 ----
***************
*** 354,357 ****
--- 351,355 ----
# True to use a database by default.
hammiefilter_persistent_use_database: True
+ hammiefilter_persistent_storage_file: ~/.hammiedb
[pop3proxy]
***************
*** 360,364 ****
# The only mandatory option is pop3proxy_server_name, eg. pop3.my-isp.com,
# but that can come from the command line - see "pop3proxy -h".
! pop3proxy_server_name: ""
pop3proxy_server_port: 110
pop3proxy_port: 110
--- 358,362 ----
# The only mandatory option is pop3proxy_server_name, eg. pop3.my-isp.com,
# but that can come from the command line - see "pop3proxy -h".
! pop3proxy_server_name:
pop3proxy_server_port: 110
pop3proxy_port: 110
***************
*** 369,373 ****
pop3proxy_unknown_cache: pop3proxy-unknown-cache
pop3proxy_persistent_use_database: False
! pop3proxy_persistent_storage_file: ""
[html_ui]
--- 367,371 ----
pop3proxy_unknown_cache: pop3proxy-unknown-cache
pop3proxy_persistent_use_database: False
! pop3proxy_persistent_storage_file: hammie.db
[html_ui]
***************
*** 433,437 ****
},
'Hammie': {'hammie_header_name': string_cracker,
- 'persistent_storage_file': string_cracker,
'clue_mailheader_cutoff': float_cracker,
'persistent_use_database': boolean_cracker,
--- 431,434 ----
***************
*** 445,448 ****
--- 442,446 ----
},
'hammiefilter' : {'hammiefilter_persistent_use_database': boolean_cracker,
+ 'hammiefilter_persistent_storage_file': string_cracker,
},
'pop3proxy': {'pop3proxy_server_name': string_cracker,
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53.2.1
retrieving revision 1.53.2.2
diff -C2 -d -r1.53.2.1 -r1.53.2.2
*** classifier.py 20 Nov 2002 06:06:28 -0000 1.53.2.1
--- classifier.py 21 Nov 2002 04:16:36 -0000 1.53.2.2
***************
*** 32,36 ****
import math
- import time
from sets import Set
--- 32,35 ----
***************
*** 49,90 ****
PICKLE_VERSION = 1
! class WordInfo(object):
! __slots__ = ('atime', # when this record was last used by scoring(*)
! 'spamcount', # # of spams in which this word appears
! 'hamcount', # # of hams in which this word appears
! 'killcount', # # of times this made it to spamprob()'s nbest
! 'spamprob', # prob(spam | msg contains this word)
! )
# Invariant: For use in a classifier database, at least one of
# spamcount and hamcount must be non-zero.
- #
- # (*)atime is the last access time, a UTC time.time() value. It's the
- # most recent time this word was used by scoring (i.e., by spamprob(),
- # not by training via learn()); or, if the word has never been used by
- # scoring, the time the word record was created (i.e., by learn()).
- # One good criterion for identifying junk (word records that have no
- # value) is to delete words that haven't been used for a long time.
- # Perhaps they were typos, or unique identifiers, or relevant to a
- # once-hot topic or scam that's fallen out of favor. Whatever, if
- # a word is no longer being used, it's just wasting space.
! def __init__(self, atime, spamprob=options.unknown_word_prob):
! self.atime = atime
! self.spamcount = self.hamcount = self.killcount = 0
! self.spamprob = spamprob
def __repr__(self):
! return "WordInfo%r" % repr((self.atime, self.spamcount,
! self.hamcount, self.killcount,
self.spamprob))
def __getstate__(self):
! return (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob)
def __setstate__(self, t):
! (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob) = t
class Bayes:
--- 48,196 ----
PICKLE_VERSION = 1
! class MetaInfo(object):
! """Information about the corpora.
!
! Contains nham and nspam, used for calculating probabilities. Also
! has a revision, incremented every time nham or nspam is adjusted to
! invalidate any cached probabilities.
!
! """
! def __init__(self):
! self._nham = 0
! self._nspam = 0
! self.revision = 0
!
! def __repr__(self):
! return "MetaInfo%r" % repr((self._nham,
! self._nspam,
! self.revision))
!
! def __getstate__(self):
! return (self._nham, self._nspam)
!
! def __setstate__(self, t):
! (self._nham, self._nspam) = t
!
! def nham(self):
! return self._nham
!
! def nspam(self):
! return self._nspam
!
! def incr_rev(self):
! self.revision += 1
!
! def incr_ham(self, amt=1):
! self._nham += amt
! self.incr_rev()
+ def incr_spam(self, amt=1):
+ self._nspam += 1
+ self.incr_rev()
+
+
+ class WordInfo(object):
# Invariant: For use in a classifier database, at least one of
# spamcount and hamcount must be non-zero.
! def __init__(self):
! self.__setstate__((0, 0))
def __repr__(self):
! return "WordInfo%r" % repr((self.spamcount,
! self.hamcount,
self.spamprob))
def __getstate__(self):
! return (self.spamcount,
! self.hamcount)
def __setstate__(self, t):
! (self.spamcount, self.hamcount) = t
! self.spamprob = None
! self.revision = None
!
! def _update_probability(self, meta):
! """Compute and store p(word) = prob(msg is spam | msg contains word).
!
! This is the Graham calculation, but stripped of biases, and
! stripped of clamping into 0.01 thru 0.99. The Bayesian
! adjustment following keeps them in a sane range, and one
! that naturally grows the more evidence there is to back up
! a probability.
!
! Returns True if the probability changed, False otherwise.
! """
!
! nham = float(meta.nham() or 1)
! nspam = float(meta.nspam() or 1)
!
! if options.experimental_ham_spam_imbalance_adjustment:
! spam2ham = min(nspam / nham, 1.0)
! ham2spam = min(nham / nspam, 1.0)
! else:
! spam2ham = ham2spam = 1.0
!
! S = options.unknown_word_strength
! StimesX = S * options.unknown_word_prob
!
! assert self.hamcount <= nham
! hamratio = self.hamcount / nham
!
! assert self.spamcount <= nspam
! spamratio = self.spamcount / nspam
!
! prob = spamratio / (hamratio + spamratio)
!
! # Now do Robinson's Bayesian adjustment.
! #
! # s*x + n*p(w)
! # f(w) = --------------
! # s + n
! #
! # I find this easier to reason about like so (equivalent when
! # s != 0):
! #
! # x - p
! # p + -------
! # 1 + n/s
! #
! # IOW, it moves p a fraction of the distance from p to x, and
! # less so the larger n is, or the smaller s is.
!
! # Experimental:
! # Picking a good value for n is interesting: how much empirical
! # evidence do we really have? If nham == nspam,
! # hamcount + spamcount makes a lot of sense, and the code here
! # does that by default.
! # But if, e.g., nham is much larger than nspam, p(w) can get a
! # lot closer to 0.0 than it can get to 1.0. That in turn makes
! # strong ham words (high hamcount) much stronger than strong
! # spam words (high spamcount), and that makes the accidental
! # appearance of a strong ham word in spam much more damaging than
! # the accidental appearance of a strong spam word in ham.
! # So we don't give hamcount full credit when nham > nspam (or
! # spamcount when nspam > nham): instead we knock hamcount down
! # to what it would have been had nham been equal to nspam. IOW,
! # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
! # we don't "believe" any count to an extent more than
! # min(nspam, nham) justifies.
!
! n = self.hamcount * spam2ham + self.spamcount * ham2spam
! prob = (StimesX + n * prob) / (S + n)
!
! self.revision = meta.revision
! if self.spamprob != prob:
! self.spamprob = prob
! return True
! else:
! return False
!
! def probability(self, meta):
! """Return this word's spam probability, recalculating if needed."""
! if meta.revision != self.revision:
! self._update_probability(meta)
! return self.spamprob
!
class Bayes:
***************
*** 105,117 ****
def __init__(self):
self.wordinfo = {}
! self.nspam = self.nham = 0
def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.nspam, self.nham = t[1:]
# spamprob() implementations. One of the following is aliased to
--- 211,223 ----
def __init__(self):
self.wordinfo = {}
! self.meta = MetaInfo()
def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.meta
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.meta = t[1:]
# spamprob() implementations. One of the following is aliased to
***************
*** 145,150 ****
clues = self._getclues(wordstream)
for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
P *= 1.0 - prob
Q *= prob
--- 251,254 ----
***************
*** 234,239 ****
clues = self._getclues(wordstream)
for prob, word, record in clues:
- if record is not None: # else wordinfo doesn't know about it
- record.killcount += 1
S *= 1.0 - prob
H *= prob
--- 338,341 ----
***************
*** 278,282 ****
spamprob = chi2_spamprob
! def learn(self, wordstream, is_spam, update_probabilities=True):
"""Teach the classifier by example.
--- 380,384 ----
spamprob = chi2_spamprob
! def learn(self, wordstream, is_spam, update_word_probabilities=True):
"""Teach the classifier by example.
***************
*** 285,302 ****
else that it's definitely not spam.
! If optional arg update_probabilities is False (the default is True),
! don't update word probabilities. Updating them is expensive, and if
! you're going to pass many messages to learn(), it's more efficient
! to pass False here and call update_probabilities() once when you're
! done -- or to call learn() with update_probabilities=True when
! passing the last new example. The important thing is that the
! probabilities get updated before calling spamprob() again.
"""
! self._add_msg(wordstream, is_spam)
! if update_probabilities:
! self.update_probabilities()
! def unlearn(self, wordstream, is_spam, update_probabilities=True):
"""In case of pilot error, call unlearn ASAP after screwing up.
--- 387,403 ----
else that it's definitely not spam.
! If optional arg update_word_probabilities is False (the default
! is True), don't update individual words' probabilities.
! Updating them is expensive, and if you're going to pass many
! messages to learn(), it's more efficient to pass False here and
! call update_probabilities() once when you're done. The
! important thing is that the probabilities get updated before
! calling spamprob() again.
!
"""
! self._add_msg(wordstream, is_spam, update_word_probabilities)
! def unlearn(self, wordstream, is_spam, update_word_probabilities=True):
"""In case of pilot error, call unlearn ASAP after screwing up.
***************
*** 304,310 ****
"""
! self._remove_msg(wordstream, is_spam)
! if update_probabilities:
! self.update_probabilities()
def update_probabilities(self):
--- 405,409 ----
"""
! self._remove_msg(wordstream, is_spam, update_word_probabilities)
def update_probabilities(self):
***************
*** 320,410 ****
for word, record in self.wordinfo.iteritems():
! self.update_word(word, record)
!
! def update_word(self, word, record):
! """Compute p(word) = prob(msg is spam | msg contains word).
!
! This is the Graham calculation, but stripped of biases, and
! stripped of clamping into 0.01 thru 0.99. The Bayesian
! adjustment following keeps them in a sane range, and one
! that naturally grows the more evidence there is to back up
! a probability.
! """
! nham = float(self.nham or 1)
! nspam = float(self.nspam or 1)
!
! if options.experimental_ham_spam_imbalance_adjustment:
! spam2ham = min(nspam / nham, 1.0)
! ham2spam = min(nham / nspam, 1.0)
! else:
! spam2ham = ham2spam = 1.0
!
! S = options.unknown_word_strength
! StimesX = S * options.unknown_word_prob
!
! hamcount = record.hamcount
! assert hamcount <= nham
! hamratio = hamcount / nham
!
! spamcount = record.spamcount
! assert spamcount <= nspam
! spamratio = spamcount / nspam
!
! prob = spamratio / (hamratio + spamratio)
!
! # Now do Robinson's Bayesian adjustment.
! #
! # s*x + n*p(w)
! # f(w) = --------------
! # s + n
! #
! # I find this easier to reason about like so (equivalent when
! # s != 0):
! #
! # x - p
! # p + -------
! # 1 + n/s
! #
! # IOW, it moves p a fraction of the distance from p to x, and
! # less so the larger n is, or the smaller s is.
!
! # Experimental:
! # Picking a good value for n is interesting: how much empirical
! # evidence do we really have? If nham == nspam,
! # hamcount + spamcount makes a lot of sense, and the code here
! # does that by default.
! # But if, e.g., nham is much larger than nspam, p(w) can get a
! # lot closer to 0.0 than it can get to 1.0. That in turn makes
! # strong ham words (high hamcount) much stronger than strong
! # spam words (high spamcount), and that makes the accidental
! # appearance of a strong ham word in spam much more damaging than
! # the accidental appearance of a strong spam word in ham.
! # So we don't give hamcount full credit when nham > nspam (or
! # spamcount when nspam > nham): instead we knock hamcount down
! # to what it would have been had nham been equal to nspam. IOW,
! # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
! # we don't "believe" any count to an extent more than
! # min(nspam, nham) justifies.
!
! n = hamcount * spam2ham + spamcount * ham2spam
! prob = (StimesX + n * prob) / (S + n)
!
! if record.spamprob != prob:
! record.spamprob = prob
! # The next seemingly pointless line appears to be a hack
! # to allow a persistent db to realize the record has changed.
! self.wordinfo[word] = record
!
! def clearjunk(self, oldesttime):
! """Forget useless wordinfo records. This can shrink the database size.
!
! A record for a word will be retained only if the word was accessed
! at or after oldesttime.
! """
!
! wordinfo = self.wordinfo
! tonuke = [w for w, r in wordinfo.iteritems() if r.atime < oldesttime]
! for w in tonuke:
! del wordinfo[w]
# NOTE: Graham's scheme had a strange asymmetry: when a word appeared
--- 419,425 ----
for word, record in self.wordinfo.iteritems():
! # This method updates probability iff the metainfo revision
! # has changed.
! record.probability(self.meta)
# NOTE: Graham's scheme had a strange asymmetry: when a word appeared
***************
*** 428,444 ****
# appears in a msg, but distorting spamprob doesn't appear a correct way
# to exploit it.
! def _add_msg(self, wordstream, is_spam):
if is_spam:
! self.nspam += 1
else:
! self.nham += 1
wordinfo = self.wordinfo
wordinfoget = wordinfo.get
- now = time.time()
for word in Set(wordstream):
record = wordinfoget(word)
if record is None:
! record = self.WordInfoClass(now)
if is_spam:
--- 443,458 ----
# appears in a msg, but distorting spamprob doesn't appear a correct way
# to exploit it.
! def _add_msg(self, wordstream, is_spam, update_word_probabilities):
if is_spam:
! self.meta.incr_spam()
else:
! self.meta.incr_ham()
wordinfo = self.wordinfo
wordinfoget = wordinfo.get
for word in Set(wordstream):
record = wordinfoget(word)
if record is None:
! record = self.WordInfoClass()
if is_spam:
***************
*** 446,461 ****
else:
record.hamcount += 1
! # Needed to tell a persistent DB that the content changed.
! wordinfo[word] = record
! def _remove_msg(self, wordstream, is_spam):
if is_spam:
! if self.nspam <= 0:
raise ValueError("spam count would go negative!")
! self.nspam -= 1
else:
! if self.nham <= 0:
raise ValueError("non-spam count would go negative!")
! self.nham -= 1
wordinfo = self.wordinfo
--- 460,480 ----
else:
record.hamcount += 1
!
! if update_word_probabilities:
! self.update_word_probability(word, record)
! else:
! # Needed to tell a persistent DB that the content changed.
! wordinfo[word] = record
!
! def _remove_msg(self, wordstream, is_spam, update_word_probabilities):
if is_spam:
! if self.meta.nspam() <= 0:
raise ValueError("spam count would go negative!")
! self.meta.incr_spam(-1)
else:
! if self.meta.nham() <= 0:
raise ValueError("non-spam count would go negative!")
! self.meta.incr_ham(-1)
wordinfo = self.wordinfo
***************
*** 472,477 ****
if record.hamcount == 0 == record.spamcount:
del wordinfo[word]
else:
! # Needed to tell a persistent DB that the content changed.
wordinfo[word] = record
--- 491,499 ----
if record.hamcount == 0 == record.spamcount:
del wordinfo[word]
+ elif update_word_probabilities:
+ update_word_probability(word, record)
else:
! # Needed to tell a persistent DB that the content
! # changed.
wordinfo[word] = record
***************
*** 484,488 ****
wordinfoget = self.wordinfo.get
- now = time.time()
for word in Set(wordstream):
record = wordinfoget(word)
--- 506,509 ----
***************
*** 490,495 ****
prob = unknown
else:
! record.atime = now
! prob = record.spamprob
distance = abs(prob - 0.5)
if distance >= mindist:
--- 511,515 ----
prob = unknown
else:
! prob = record.probability(self.meta)
distance = abs(prob - 0.5)
if distance >= mindist:
Index: hammiefilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammiefilter.py,v
retrieving revision 1.2.2.1
retrieving revision 1.2.2.2
diff -C2 -d -r1.2.2.1 -r1.2.2.2
*** hammiefilter.py 19 Nov 2002 23:45:25 -0000 1.2.2.1
--- hammiefilter.py 21 Nov 2002 04:16:36 -0000 1.2.2.2
***************
*** 52,89 ****
sys.exit(code)
! def newdb():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'n')
! h.store()
! print "Created new database in", options.persistent_storage_file
! def filter():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'r')
! msg = sys.stdin.read()
! print h.filter(msg)
! def train_ham():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'w')
! msg = sys.stdin.read()
! h.train_ham(msg)
! h.update_probabilities()
! h.store()
! def train_spam():
! h = hammie.open(options.persistent_storage_file,
! options.hammiefilter_persistent_use_database,
! 'w')
! msg = sys.stdin.read()
! h.train_spam(msg)
! h.update_probabilities()
! h.store()
def main():
! action = filter
opts, args = getopt.getopt(sys.argv[1:], 'hngs')
for opt, arg in opts:
--- 52,93 ----
sys.exit(code)
! class HammieFilter(object):
! def __init__(self):
! options = Options.options
! options.mergefiles(['/etc/hammierc',
! os.path.expanduser('~/.hammierc')])
!
! self.dbname = options.hammiefilter_persistent_storage_file
! self.dbname = os.path.expanduser(self.dbname)
! self.usedb = options.hammiefilter_persistent_use_database
!
! def newdb(self):
! h = hammie.open(self.dbname, self.usedb, 'n')
! h.store()
! print "Created new database in", self.dbname
! def filter(self):
! h = hammie.open(self.dbname, self.usedb, 'r')
! msg = sys.stdin.read()
! print h.filter(msg)
! def train_ham(self):
! h = hammie.open(self.dbname, self.usedb, 'c')
! msg = sys.stdin.read()
! h.train_ham(msg)
! h.update_probabilities()
! h.store()
!
! def train_spam(self):
! h = hammie.open(self.dbname, self.usedb, 'c')
! msg = sys.stdin.read()
! h.train_spam(msg)
! h.update_probabilities()
! h.store()
def main():
! h = HammieFilter()
! action = h.filter
opts, args = getopt.getopt(sys.argv[1:], 'hngs')
for opt, arg in opts:
***************
*** 91,103 ****
usage(0)
elif opt == '-g':
! action = train_ham
elif opt == '-s':
! action = train_spam
elif opt == "-n":
! action = newdb
!
! # hammiefilter overrides
! options.mergefiles(['/etc/hammierc',
! os.path.expanduser('~/.hammierc')])
action()
--- 95,103 ----
usage(0)
elif opt == '-g':
! action = h.train_ham
elif opt == '-s':
! action = h.train_spam
elif opt == "-n":
! action = h.newdb
action()
- Previous message: [Spambayes-checkins] spambayes dbdict.py,1.1.2.2,1.1.2.3
- Next message: [Spambayes-checkins]
spambayes Bayes.py,1.5.2.4,1.5.2.5 classifier.py,1.53.2.2,1.53.2.3
hammie.py,1.40.2.1,1.40.2.2 hammiefilter.py,1.2.2.2,1.2.2.3
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Spambayes-checkins
mailing list