[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.8,1.9
tim_one@users.sourceforge.net
tim_one@users.sourceforge.net
Wed, 21 Aug 2002 14:01:57 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv17414
Modified Files:
GBayes.py
Log Message:
Renamed "modtime" to "atime" to better reflect its meaning, and added a
comment block explaining what the field records.
Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** GBayes.py 20 Aug 2002 20:42:59 -0000 1.8
--- GBayes.py 21 Aug 2002 21:01:47 -0000 1.9
***************
*** 74,78 ****
class WordInfo(object):
! __slots__ = ('modtime', # when this record was last touched
'spamcount', # # of times word appears in spam
'hamcount', # # of times word appears in non-spam
--- 74,78 ----
class WordInfo(object):
! __slots__ = ('atime', # when this record was last used by scoring(*)
'spamcount', # # of times word appears in spam
'hamcount', # # of times word appears in non-spam
***************
*** 80,100 ****
'spamprob', # prob(spam | msg contains this word)
)
! def __init__(self, modtime):
! self.modtime = modtime
self.spamcount = self.hamcount = self.killcount = 0
self.spamprob = None
def __repr__(self):
! return "WordInfo%r" % repr((self.modtime, self.spamcount,
self.hamcount, self.killcount,
self.spamprob))
def __getstate__(self):
! return (self.modtime, self.spamcount, self.hamcount, self.killcount,
self.spamprob)
def __setstate__(self, t):
! (self.modtime, self.spamcount, self.hamcount, self.killcount,
self.spamprob) = t
--- 80,109 ----
'spamprob', # prob(spam | msg contains this word)
)
+ # (*)atime is the last access time, a UTC time.time() value. It's the
+ # most recent time this word was used by scoring (i.e., by spamprob(),
+ # not by training via learn()); or, if the word has never been used by
+ # scoring, the time the word record was created (i.e., by learn()).
+ # One good criterion for identifying junk (word records that have no
+ # value) is to delete words that haven't been used for a long time.
+ # Perhaps they were typos, or unique identifiers, or relevant to a
+ # once-hot topic or scam that's fallen out of favor. Whatever the reason,
+ # if a word is no longer being used, it's just wasting space.
! def __init__(self, atime):
! self.atime = atime
self.spamcount = self.hamcount = self.killcount = 0
self.spamprob = None
def __repr__(self):
! return "WordInfo%r" % repr((self.atime, self.spamcount,
self.hamcount, self.killcount,
self.spamprob))
def __getstate__(self):
! return (self.atime, self.spamcount, self.hamcount, self.killcount,
self.spamprob)
def __setstate__(self, t):
! (self.atime, self.spamcount, self.hamcount, self.killcount,
self.spamprob) = t
***************
*** 146,150 ****
prob = UNKNOWN_SPAMPROB
else:
! record.modtime = now
prob = record.spamprob
--- 155,159 ----
prob = UNKNOWN_SPAMPROB
else:
! record.atime = now
prob = record.spamprob
***************
*** 246,250 ****
mincount = float(mincount)
for w, r in wordinfo.iteritems():
! if (r.modtime < oldesttime and
SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
if self.DEBUG:
--- 255,259 ----
mincount = float(mincount)
for w, r in wordinfo.iteritems():
! if (r.atime < oldesttime and
SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
if self.DEBUG: