[Spambayes] Split DB and no update_probabilities

T. Alexander Popiel popiel@wolfskeep.com
Wed Nov 20 23:31:28 2002


I've got a patch to split the word database into two pieces:
one with the ham and spam counts, and one with the spamprobs.
At the same time, I got rid of the timestamps and killcounts,
both of which are rarely used (and can be added back in by
subclasses if they're really wanted).
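
For instance, a subclass could hang killcount back onto the
count records; an untested sketch against the patched classes:

    # Untested sketch: restoring killcount via a subclass.  A real
    # version would also need to override the scoring methods to
    # bump the count, and __getstate__/__setstate__ to persist it.
    class KillCountInfo(CountInfo):
        __slots__ = ('killcount',)
        def __init__(self):
            CountInfo.__init__(self)
            self.killcount = 0

    class KillBayes(Bayes):
        CountInfoClass = KillCountInfo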

With this patch, the spamprobs can be cheaply discarded en masse,
leading to a friendlier interface for incremental training.  To
go along with this, update_probabilities is eliminated, and the
probabilities are generated on demand and merely cached in the
spamprob database.
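
The upshot for callers: bulk training is now just a stream of
learn() calls, with no batching flag and no closing
update_probabilities().  A sketch of the patched interface in
use (the message variables are hypothetical stand-ins):

    # Sketch of incremental training against the patched interface.
    # ham_messages, spam_messages, and new_message are hypothetical.
    import classifier
    from tokenizer import tokenize

    c = classifier.Bayes()
    for msg in ham_messages:
        c.learn(tokenize(msg), False)  # updates counts, wipes prob cache
    for msg in spam_messages:
        c.learn(tokenize(msg), True)
    # No update_probabilities() needed: the first spamprob() call
    # recomputes and caches probabilities for just the words it sees.
    prob = c.spamprob(tokenize(new_message))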

Unfortunately, this patch breaks all preexisting databases.
It wouldn't be too hard to write a bit of code to take an old
pickle and create a count database from it, but I'm too lazy
to do that properly at the moment; an untested sketch of the
idea follows this paragraph.  (The spamprob database would, of
course, take care of itself.)  This patch also likely breaks
most client code, since update_probabilities no longer exists,
and the learn and unlearn methods no longer take the optional
boolean that controlled whether update_probabilities was called.
I could have left in a no-op update_probabilities and ignored
the optional arguments to learn and unlearn... but this is
alpha code, and a clean break from the old ways is probably
better in the long run.
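
Here's that conversion sketch, for the curious.  It's untested,
and it assumes the old database is unpickled with the old
classifier module (the one that still has WordInfo and the
version-1 pickle format) importable:

    # Untested sketch of an old-database converter.  'old' is a Bayes
    # object unpickled with the OLD classifier module; NewBayes is the
    # patched class.
    def convert(old, NewBayes):
        new = NewBayes()
        new.nspam, new.nham = old.nspam, old.nham
        for word, r in old.wordinfo.items():
            rec = new.CountInfoClass()
            rec.spamcount = r.spamcount
            rec.hamcount = r.hamcount
            new.countinfo[word] = rec
        # new.probinfo stays empty; it repopulates itself lazily.
        return new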

Since this patch does break stuff rather severely, I'm just
including it in this email for people to look at and play
with, instead of checking it in and thereby forcing it
down people's throats.

Enjoy.

- Alex

Index: TestDriver.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/TestDriver.py,v
retrieving revision 1.30
diff -c -r1.30 TestDriver.py
*** TestDriver.py	19 Nov 2002 17:43:27 -0000	1.30
--- TestDriver.py	20 Nov 2002 23:18:17 -0000
***************
*** 304,325 ****
              prob, clues = c.spamprob(e, True)
              printmsg(e, prob, clues)
  
-         if options.show_best_discriminators > 0:
-             print
-             print "    best discriminators:"
-             stats = [(-1, None)] * options.show_best_discriminators
-             smallest_killcount = -1
-             for w, r in c.wordinfo.iteritems():
-                 if r.killcount > smallest_killcount:
-                     heapreplace(stats, (r.killcount, w))
-                     smallest_killcount = stats[0][0]
-             stats.sort()
-             for count, w in stats:
-                 if count < 0:
-                     continue
-                 r = c.wordinfo[w]
-                 print "        %r %d %g" % (w, r.killcount, r.spamprob)
- 
          if options.show_histograms:
              printhist("this pair:", local_ham_hist, local_spam_hist)
          self.trained_ham_hist += local_ham_hist
--- 304,309 ----
Index: Tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Tester.py,v
retrieving revision 1.8
diff -c -r1.8 Tester.py
*** Tester.py	7 Nov 2002 22:30:04 -0000	1.8
--- Tester.py	20 Nov 2002 23:18:18 -0000
***************
*** 59,69 ****
          learn = self.classifier.learn
          if hamstream is not None:
              for example in hamstream:
!                 learn(example, False, False)
          if spamstream is not None:
              for example in spamstream:
!                 learn(example, True, False)
!         self.classifier.update_probabilities()
  
      # Untrain the classifier on streams of ham and spam.  Updates
      # probabilities before returning, and resets test results.
--- 59,68 ----
          learn = self.classifier.learn
          if hamstream is not None:
              for example in hamstream:
!                 learn(example, False)
          if spamstream is not None:
              for example in spamstream:
!                 learn(example, True)
  
      # Untrain the classifier on streams of ham and spam.  Updates
      # probabilities before returning, and resets test results.
***************
*** 72,82 ****
          unlearn = self.classifier.unlearn
          if hamstream is not None:
              for example in hamstream:
!                 unlearn(example, False, False)
          if spamstream is not None:
              for example in spamstream:
!                 unlearn(example, True, False)
!         self.classifier.update_probabilities()
  
      # Run prediction on each sample in stream.  You're swearing that stream
      # is entirely composed of spam (is_spam True), or of ham (is_spam False).
--- 71,80 ----
          unlearn = self.classifier.unlearn
          if hamstream is not None:
              for example in hamstream:
!                 unlearn(example, False)
          if spamstream is not None:
              for example in spamstream:
!                 unlearn(example, True)
  
      # Run prediction on each sample in stream.  You're swearing that stream
      # is entirely composed of spam (is_spam True), or of ham (is_spam False).
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.53
diff -c -r1.53 classifier.py
*** classifier.py	18 Nov 2002 18:23:09 -0000	1.53
--- classifier.py	20 Nov 2002 23:18:18 -0000
***************
*** 46,91 ****
  
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 1
  
! class WordInfo(object):
!     __slots__ = ('atime',     # when this record was last used by scoring(*)
!                  'spamcount', # # of spams in which this word appears
!                  'hamcount',  # # of hams in which this word appears
!                  'killcount', # # of times this made it to spamprob()'s nbest
!                  'spamprob',  # prob(spam | msg contains this word)
!                 )
! 
!     # Invariant:  For use in a classifier database, at least one of
!     # spamcount and hamcount must be non-zero.
!     #
!     # (*)atime is the last access time, a UTC time.time() value.  It's the
!     # most recent time this word was used by scoring (i.e., by spamprob(),
!     # not by training via learn()); or, if the word has never been used by
!     # scoring, the time the word record was created (i.e., by learn()).
!     # One good criterion for identifying junk (word records that have no
!     # value) is to delete words that haven't been used for a long time.
!     # Perhaps they were typos, or unique identifiers, or relevant to a
!     # once-hot topic or scam that's fallen out of favor.  Whatever, if
!     # a word is no longer being used, it's just wasting space.
! 
!     def __init__(self, atime, spamprob=options.unknown_word_prob):
!         self.atime = atime
!         self.spamcount = self.hamcount = self.killcount = 0
          self.spamprob = spamprob
  
      def __repr__(self):
!         return "WordInfo%r" % repr((self.atime, self.spamcount,
!                                     self.hamcount, self.killcount,
!                                     self.spamprob))
  
      def __getstate__(self):
!         return (self.atime, self.spamcount, self.hamcount, self.killcount,
!                 self.spamprob)
  
      def __setstate__(self, t):
!         (self.atime, self.spamcount, self.hamcount, self.killcount,
!          self.spamprob) = t
  
  class Bayes:
      # Defining __slots__ here made Jeremy's life needlessly difficult when
--- 46,82 ----
  
  LN2 = math.log(2)       # used frequently by chi-combining
  
! PICKLE_VERSION = 2
  
! class CountInfo(object):
!     __slots__ = ('spamcount', 'hamcount')
! 
!     def __init__(self):
!         self.spamcount = self.hamcount = 0
! 
!     def __repr__(self):
!         return "CountInfo%r" % repr((self.spamcount, self.hamcount))
! 
!     def __getstate__(self):
!         return (self.spamcount, self.hamcount)
! 
!     def __setstate__(self, t):
!         (self.spamcount, self.hamcount) = t
! 
! class ProbInfo(object):
!     __slots__ = ('spamprob')
! 
!     def __init__(self, spamprob=options.unknown_word_prob):
          self.spamprob = spamprob
  
      def __repr__(self):
!         return "ProbInfo%r" % repr((self.spamprob,))
  
      def __getstate__(self):
!         return (self.spamprob,)
  
      def __setstate__(self, t):
!         (self.spamprob,) = t
  
  class Bayes:
      # Defining __slots__ here made Jeremy's life needlessly difficult when
***************
*** 100,118 ****
      #            )
  
      # allow a subclass to use a different class for WordInfo
!     WordInfoClass = WordInfo
  
      def __init__(self):
!         self.wordinfo = {}
          self.nspam = self.nham = 0
  
      def __getstate__(self):
!         return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.wordinfo, self.nspam, self.nham = t[1:]
  
      # spamprob() implementations.  One of the following is aliased to
      # spamprob, depending on option settings.
--- 91,112 ----
      #            )
  
      # allow a subclass to use a different class for WordInfo
!     CountInfoClass = CountInfo
!     ProbInfoClass = ProbInfo
  
      def __init__(self):
!         self.countinfo = {}
!         self.probinfo = {}
          self.nspam = self.nham = 0
  
      def __getstate__(self):
!         return (PICKLE_VERSION, self.countinfo, self.probinfo,
!                 self.nspam, self.nham)
  
      def __setstate__(self, t):
          if t[0] != PICKLE_VERSION:
              raise ValueError("Can't unpickle -- version %s unknown" % t[0])
!         self.countinfo, self.probinfo, self.nspam, self.nham = t[1:]
  
      # spamprob() implementations.  One of the following is aliased to
      # spamprob, depending on option settings.
***************
*** 143,151 ****
          P = Q = 1.0
          Pexp = Qexp = 0
          clues = self._getclues(wordstream)
!         for prob, word, record in clues:
!             if record is not None:  # else wordinfo doesn't know about it
!                 record.killcount += 1
              P *= 1.0 - prob
              Q *= prob
              if P < 1e-200:  # move back into range
--- 137,143 ----
          P = Q = 1.0
          Pexp = Qexp = 0
          clues = self._getclues(wordstream)
!         for prob, word in clues:
              P *= 1.0 - prob
              Q *= prob
              if P < 1e-200:  # move back into range
***************
*** 232,240 ****
          Hexp = Sexp = 0
  
          clues = self._getclues(wordstream)
!         for prob, word, record in clues:
!             if record is not None:  # else wordinfo doesn't know about it
!                 record.killcount += 1
              S *= 1.0 - prob
              H *= prob
              if S < 1e-200:  # prevent underflow
--- 224,230 ----
          Hexp = Sexp = 0
  
          clues = self._getclues(wordstream)
!         for prob, word in clues:
              S *= 1.0 - prob
              H *= prob
              if S < 1e-200:  # prevent underflow
***************
*** 277,283 ****
      if options.use_chi_squared_combining:
          spamprob = chi2_spamprob
  
!     def learn(self, wordstream, is_spam, update_probabilities=True):
          """Teach the classifier by example.
  
          wordstream is a word stream representing a message.  If is_spam is
--- 267,273 ----
      if options.use_chi_squared_combining:
          spamprob = chi2_spamprob
  
!     def learn(self, wordstream, is_spam):
          """Teach the classifier by example.
  
          wordstream is a word stream representing a message.  If is_spam is
***************
*** 294,323 ****
          """
  
          self._add_msg(wordstream, is_spam)
-         if update_probabilities:
-             self.update_probabilities()
  
!     def unlearn(self, wordstream, is_spam, update_probabilities=True):
          """In case of pilot error, call unlearn ASAP after screwing up.
  
          Pass the same arguments you passed to learn().
          """
  
          self._remove_msg(wordstream, is_spam)
-         if update_probabilities:
-             self.update_probabilities()
- 
-     def update_probabilities(self):
-         """Update the word probabilities in the spam database.
- 
-         This computes a new probability for every word in the database,
-         so can be expensive.  learn() and unlearn() update the probabilities
-         each time by default.  Thay have an optional argument that allows
-         to skip this step when feeding in many messages, and in that case
-         you should call update_probabilities() after feeding the last
-         message and before calling spamprob().
-         """
  
          nham = float(self.nham or 1)
          nspam = float(self.nspam or 1)
  
--- 284,300 ----
          """
  
          self._add_msg(wordstream, is_spam)
  
!     def unlearn(self, wordstream, is_spam):
          """In case of pilot error, call unlearn ASAP after screwing up.
  
          Pass the same arguments you passed to learn().
          """
  
          self._remove_msg(wordstream, is_spam)
  
+     # Compute the probability reflected by a set of counts.
+     def _compute_probability(self, record):
          nham = float(self.nham or 1)
          nspam = float(self.nspam or 1)
  
***************
*** 330,406 ****
          S = options.unknown_word_strength
          StimesX = S * options.unknown_word_prob
  
!         for word, record in self.wordinfo.iteritems():
!             # Compute p(word) = prob(msg is spam | msg contains word).
!             # This is the Graham calculation, but stripped of biases, and
!             # stripped of clamping into 0.01 thru 0.99.  The Bayesian
!             # adjustment following keeps them in a sane range, and one
!             # that naturally grows the more evidence there is to back up
!             # a probability.
!             hamcount = record.hamcount
!             assert hamcount <= nham
!             hamratio = hamcount / nham
! 
!             spamcount = record.spamcount
!             assert spamcount <= nspam
!             spamratio = spamcount / nspam
! 
!             prob = spamratio / (hamratio + spamratio)
  
!             # Now do Robinson's Bayesian adjustment.
!             #
!             #         s*x + n*p(w)
!             # f(w) = --------------
!             #           s + n
!             #
!             # I find this easier to reason about like so (equivalent when
!             # s != 0):
!             #
!             #        x - p
!             #  p +  -------
!             #       1 + n/s
!             #
!             # IOW, it moves p a fraction of the distance from p to x, and
!             # less so the larger n is, or the smaller s is.
  
!             # Experimental:
!             # Picking a good value for n is interesting:  how much empirical
!             # evidence do we really have?  If nham == nspam,
!             # hamcount + spamcount makes a lot of sense, and the code here
!             # does that by default.
!             # But if, e.g., nham is much larger than nspam, p(w) can get a
!             # lot closer to 0.0 than it can get to 1.0.  That in turn makes
!             # strong ham words (high hamcount) much stronger than strong
!             # spam words (high spamcount), and that makes the accidental
!             # appearance of a strong ham word in spam much more damaging than
!             # the accidental appearance of a strong spam word in ham.
!             # So we don't give hamcount full credit when nham > nspam (or
!             # spamcount when nspam > nham):  instead we knock hamcount down
!             # to what it would have been had nham been equal to nspam.  IOW,
!             # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
!             # we don't "believe" any count to an extent more than
!             # min(nspam, nham) justifies.
! 
!             n = hamcount * spam2ham  +  spamcount * ham2spam
!             prob = (StimesX + n * prob) / (S + n)
! 
!             if record.spamprob != prob:
!                 record.spamprob = prob
!                 # The next seemingly pointless line appears to be a hack
!                 # to allow a persistent db to realize the record has changed.
!                 self.wordinfo[word] = record
! 
!     def clearjunk(self, oldesttime):
!         """Forget useless wordinfo records.  This can shrink the database size.
! 
!         A record for a word will be retained only if the word was accessed
!         at or after oldesttime.
!         """
  
!         wordinfo = self.wordinfo
!         tonuke = [w for w, r in wordinfo.iteritems() if r.atime < oldesttime]
!         for w in tonuke:
!             del wordinfo[w]
  
      # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
      # n>1 times in a single message, training added n to the word's hamcount
--- 307,373 ----
          S = options.unknown_word_strength
          StimesX = S * options.unknown_word_prob
  
!         # Compute p(word) = prob(msg is spam | msg contains word).
!         # This is the Graham calculation, but stripped of biases, and
!         # stripped of clamping into 0.01 thru 0.99.  The Bayesian
!         # adjustment following keeps them in a sane range, and one
!         # that naturally grows the more evidence there is to back up
!         # a probability.
!         hamcount = record.hamcount
!         assert hamcount <= nham
!         hamratio = hamcount / nham
! 
!         spamcount = record.spamcount
!         assert spamcount <= nspam
!         spamratio = spamcount / nspam
  
!         prob = spamratio / (hamratio + spamratio)
  
!         # Now do Robinson's Bayesian adjustment.
!         #
!         #         s*x + n*p(w)
!         # f(w) = --------------
!         #           s + n
!         #
!         # I find this easier to reason about like so (equivalent when
!         # s != 0):
!         #
!         #        x - p
!         #  p +  -------
!         #       1 + n/s
!         #
!         # IOW, it moves p a fraction of the distance from p to x, and
!         # less so the larger n is, or the smaller s is.
  
!         # Experimental:
!         # Picking a good value for n is interesting:  how much empirical
!         # evidence do we really have?  If nham == nspam,
!         # hamcount + spamcount makes a lot of sense, and the code here
!         # does that by default.
!         # But if, e.g., nham is much larger than nspam, p(w) can get a
!         # lot closer to 0.0 than it can get to 1.0.  That in turn makes
!         # strong ham words (high hamcount) much stronger than strong
!         # spam words (high spamcount), and that makes the accidental
!         # appearance of a strong ham word in spam much more damaging than
!         # the accidental appearance of a strong spam word in ham.
!         # So we don't give hamcount full credit when nham > nspam (or
!         # spamcount when nspam > nham):  instead we knock hamcount down
!         # to what it would have been had nham been equal to nspam.  IOW,
!         # we multiply hamcount by nspam/nham when nspam < nham; or, IOOW,
!         # we don't "believe" any count to an extent more than
!         # min(nspam, nham) justifies.
! 
!         n = hamcount * spam2ham  +  spamcount * ham2spam
!         prob = (StimesX + n * prob) / (S + n)
! 
!         return prob
! 
!     # Forget all the cached probability information.
!     # This is usually done in the process of learning or unlearning,
!     # since changing nham and nspam changes the probability of nearly
!     # every word in the database.
!     def _wipe_probinfo(self):
!         self.probinfo = {}
  
      # NOTE:  Graham's scheme had a strange asymmetry:  when a word appeared
      # n>1 times in a single message, training added n to the word's hamcount
***************
*** 427,447 ****
              self.nspam += 1
          else:
              self.nham += 1
  
!         wordinfo = self.wordinfo
!         wordinfoget = wordinfo.get
!         now = time.time()
          for word in Set(wordstream):
!             record = wordinfoget(word)
              if record is None:
!                 record = self.WordInfoClass(now)
  
              if is_spam:
                  record.spamcount += 1
              else:
                  record.hamcount += 1
              # Needed to tell a persistent DB that the content changed.
!             wordinfo[word] = record
  
      def _remove_msg(self, wordstream, is_spam):
          if is_spam:
--- 394,414 ----
              self.nspam += 1
          else:
              self.nham += 1
+         self._wipe_probinfo()
  
!         countinfo = self.countinfo
!         countinfoget = countinfo.get
          for word in Set(wordstream):
!             record = countinfoget(word)
              if record is None:
!                 record = self.CountInfoClass()
  
              if is_spam:
                  record.spamcount += 1
              else:
                  record.hamcount += 1
              # Needed to tell a persistent DB that the content changed.
!             countinfo[word] = record
  
      def _remove_msg(self, wordstream, is_spam):
          if is_spam:
***************
*** 452,462 ****
              if self.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
              self.nham -= 1
  
!         wordinfo = self.wordinfo
!         wordinfoget = wordinfo.get
          for word in Set(wordstream):
!             record = wordinfoget(word)
              if record is not None:
                  if is_spam:
                      if record.spamcount > 0:
--- 419,430 ----
              if self.nham <= 0:
                  raise ValueError("non-spam count would go negative!")
              self.nham -= 1
+         self._wipe_probinfo()
  
!         countinfo = self.countinfo
!         countinfoget = countinfo.get
          for word in Set(wordstream):
!             record = countinfoget(word)
              if record is not None:
                  if is_spam:
                      if record.spamcount > 0:
***************
*** 465,474 ****
                      if record.hamcount > 0:
                          record.hamcount -= 1
                  if record.hamcount == 0 == record.spamcount:
!                     del wordinfo[word]
                  else:
                      # Needed to tell a persistent DB that the content changed.
!                     wordinfo[word] = record
  
      def _getclues(self, wordstream):
          mindist = options.minimum_prob_strength
--- 433,454 ----
                      if record.hamcount > 0:
                          record.hamcount -= 1
                  if record.hamcount == 0 == record.spamcount:
!                     del countinfo[word]
                  else:
                      # Needed to tell a persistent DB that the content changed.
!                     countinfo[word] = record
! 
!     # Handle the generation and caching of the spamprob values.
!     def _getprobability(self, word):
!         record = self.probinfo.get(word)
!         if record is None:
!             counts = self.countinfo.get(word)
!             if counts is None:
!                 return options.unknown_word_prob
!             record = self.ProbInfoClass()
!             record.spamprob = self._compute_probability(counts)
!             self.probinfo[word] = record
!         return record.spamprob
  
      def _getclues(self, wordstream):
          mindist = options.minimum_prob_strength
***************
*** 477,497 ****
          clues = []  # (distance, prob, word, record) tuples
          pushclue = clues.append
  
!         wordinfoget = self.wordinfo.get
!         now = time.time()
          for word in Set(wordstream):
!             record = wordinfoget(word)
!             if record is None:
!                 prob = unknown
!             else:
!                 record.atime = now
!                 prob = record.spamprob
              distance = abs(prob - 0.5)
              if distance >= mindist:
!                 pushclue((distance, prob, word, record))
  
          clues.sort()
          if len(clues) > options.max_discriminators:
              del clues[0 : -options.max_discriminators]
!         # Return (prob, word, record).
          return [t[1:] for t in clues]
--- 457,471 ----
          clues = []  # (distance, prob, word, record) tuples
          pushclue = clues.append
  
!         probget = self._getprobability
          for word in Set(wordstream):
!             prob = probget(word)
              distance = abs(prob - 0.5)
              if distance >= mindist:
!                 pushclue((distance, prob, word))
  
          clues.sort()
          if len(clues) > options.max_discriminators:
              del clues[0 : -options.max_discriminators]
!         # Return (prob, word).
          return [t[1:] for t in clues]


