[Python-checkins] python/nondist/sandbox/spambayes GBayes.py,1.8,1.9

Wed, 21 Aug 2002 14:01:57 -0700

Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv17414

Modified Files:
	GBayes.py 
Log Message:
Renamed "modtime" to "atime" to better reflect its meaning, and added a
comment block explaining what the field records.


Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** GBayes.py	20 Aug 2002 20:42:59 -0000	1.8
--- GBayes.py	21 Aug 2002 21:01:47 -0000	1.9
***************
*** 74,78 ****
  
  class WordInfo(object):
!     __slots__ = ('modtime',   # when this record was last touched
                   'spamcount', # # of times word appears in spam
                   'hamcount',  # # of times word appears in non-spam
--- 74,78 ----
  
  class WordInfo(object):
!     __slots__ = ('atime',     # when this record was last used by scoring(*)
                   'spamcount', # # of times word appears in spam
                   'hamcount',  # # of times word appears in non-spam
***************
*** 80,100 ****
                   'spamprob',  # prob(spam | msg contains this word)
                  )
  
!     def __init__(self, modtime):
!         self.modtime = modtime
          self.spamcount = self.hamcount = self.killcount = 0
          self.spamprob = None
  
      def __repr__(self):
!         return "WordInfo%r" % repr((self.modtime, self.spamcount,
                                      self.hamcount, self.killcount,
                                      self.spamprob))
  
      def __getstate__(self):
!         return (self.modtime, self.spamcount, self.hamcount, self.killcount,
                  self.spamprob)
  
      def __setstate__(self, t):
!         (self.modtime, self.spamcount, self.hamcount, self.killcount,
           self.spamprob) = t
  
--- 80,109 ----
                   'spamprob',  # prob(spam | msg contains this word)
                  )
+     # (*)atime is the last access time, a UTC time.time() value.  It's the
+     # most recent time this word was used by scoring (i.e., by spamprob(),
+     # not by training via learn()); or, if the word has never been used by
+     # scoring, the time the word record was created (i.e., by learn()).
+     # One good criterion for identifying junk (word records that have no
+     # value) is to delete words that haven't been used for a long time.
+     # Perhaps they were typos, or unique identifiers, or relevant to a
+     # once-hot topic or scam that's fallen out of favor.  Whatever, if
+     # a word is no longer being used, it's just wasting space.
  
!     def __init__(self, atime):
!         self.atime = atime
          self.spamcount = self.hamcount = self.killcount = 0
          self.spamprob = None
  
      def __repr__(self):
!         return "WordInfo%r" % repr((self.atime, self.spamcount,
                                      self.hamcount, self.killcount,
                                      self.spamprob))
  
      def __getstate__(self):
!         return (self.atime, self.spamcount, self.hamcount, self.killcount,
                  self.spamprob)
  
      def __setstate__(self, t):
!         (self.atime, self.spamcount, self.hamcount, self.killcount,
           self.spamprob) = t
  
***************
*** 146,150 ****
                  prob = UNKNOWN_SPAMPROB
              else:
!                 record.modtime = now
                  prob = record.spamprob
  
--- 155,159 ----
                  prob = UNKNOWN_SPAMPROB
              else:
!                 record.atime = now
                  prob = record.spamprob
  
***************
*** 246,250 ****
          mincount = float(mincount)
          for w, r in wordinfo.iteritems():
!             if (r.modtime < oldesttime and
                  SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
                  if self.DEBUG:
--- 255,259 ----
          mincount = float(mincount)
          for w, r in wordinfo.iteritems():
!             if (r.atime < oldesttime and
                  SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount):
                  if self.DEBUG: