[Spambayes-checkins] spambayes Options.py,1.20,1.21 classifier.py,1.14,1.15

Tim Peters tim_one@users.sourceforge.net
Fri, 20 Sep 2002 19:46:23 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv20258

Modified Files:
	Options.py classifier.py 
Log Message:
Added some speculative options for more of Gary Robinson's ideas.  Will
explain on the spambayes list.


Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** Options.py	19 Sep 2002 09:34:56 -0000	1.20
--- Options.py	21 Sep 2002 02:46:20 -0000	1.21
***************
*** 150,155 ****
  max_discriminators: 16
  
! # Use Gary Robinson's scheme for combining probabilities.
  use_robinson_probability: False
  """
  
--- 150,168 ----
  max_discriminators: 16
  
! ###########################################################################
! # Speculative options for Gary Robinson's ideas.  These may go away, or
! # a bunch of incompatible stuff above may go away.
! 
! # Use Gary's scheme for combining probabilities.
! use_robinson_combining: False
! 
! # Use Gary's scheme for computing probabilities, along with its "a" and
! # "x" parameters.
  use_robinson_probability: False
+ robinson_probability_a: 1.0
+ robinson_probability_x: 0.5
+ 
+ # Use Gary's scheme for ranking probabilities.
+ use_robinson_ranking: False
  """
  
***************
*** 189,193 ****
--- 202,210 ----
                     'unknown_spamprob': float_cracker,
                     'max_discriminators': int_cracker,
+                    'use_robinson_combining': boolean_cracker,
                     'use_robinson_probability': boolean_cracker,
+                    'robinson_probability_a': float_cracker,
+                    'robinson_probability_x': float_cracker,
+                    'use_robinson_ranking': boolean_cracker,
                     },
  }
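
For reference, the new entries surface as plain attributes on the shared
options object; that is how classifier.py tests them in the second half of
this checkin.  A minimal sketch of reading them back (the exact import form
is assumed from the rest of the project, not shown in this diff):

    # Hedged sketch, not part of this checkin.
    # Assumes the usual "from Options import options" import.
    from Options import options

    print(options.use_robinson_combining)    # False by default
    print(options.use_robinson_probability)  # False by default
    print(options.robinson_probability_a)    # 1.0
    print(options.robinson_probability_x)    # 0.5
    print(options.use_robinson_ranking)      # False by default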

Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.14
retrieving revision 1.15
diff -C2 -d -r1.14 -r1.15
*** classifier.py	21 Sep 2002 00:15:16 -0000	1.14
--- classifier.py	21 Sep 2002 02:46:20 -0000	1.15
***************
*** 314,357 ****
                  heapreplace(nbest, x)
  
!         if options.use_robinson_probability:
!             # This combination method is due to Gary Robinson.
!             # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
!             # In preliminary tests, it did just as well as Graham's scheme,
!             # but creates a definite "middle ground" around 0.5 where false
!             # negatives and false positives can actually be found in
!             # non-trivial numbers.
!             P = Q = 1.0
!             num_clues = 0
!             for distance, prob, word, record in nbest:
!                 if prob is None:    # it's one of the dummies nbest started with
!                     continue
!                 if record is not None:  # else wordinfo doesn't know about it
!                     record.killcount += 1
!                 if evidence:
!                     clues.append((word, prob))
!                 num_clues += 1
!                 P *= 1.0 - prob
!                 Q *= prob
! 
!             if num_clues:
!                 P = 1.0 - P**(1./num_clues)
!                 Q = 1.0 - Q**(1./num_clues)
!                 prob = (P-Q)/(P+Q)  # in -1 .. 1
!                 prob = 0.5 + prob/2 # shift to 0 .. 1
!             else:
!                 prob = 0.5
!         else:
!             prob_product = inverse_prob_product = 1.0
!             for distance, prob, word, record in nbest:
!                 if prob is None:    # it's one of the dummies nbest started with
!                     continue
!                 if record is not None:  # else wordinfo doesn't know about it
!                     record.killcount += 1
!                 if evidence:
!                     clues.append((word, prob))
!                 prob_product *= prob
!                 inverse_prob_product *= 1.0 - prob
  
!             prob = prob_product / (prob_product + inverse_prob_product)
  
          if evidence:
--- 314,329 ----
                  heapreplace(nbest, x)
  
!         prob_product = inverse_prob_product = 1.0
!         for distance, prob, word, record in nbest:
!             if prob is None:    # it's one of the dummies nbest started with
!                 continue
!             if record is not None:  # else wordinfo doesn't know about it
!                 record.killcount += 1
!             if evidence:
!                 clues.append((word, prob))
!             prob_product *= prob
!             inverse_prob_product *= 1.0 - prob
  
!         prob = prob_product / (prob_product + inverse_prob_product)
  
          if evidence:
***************
*** 361,365 ****
              return prob
  
- 
      def learn(self, wordstream, is_spam, update_probabilities=True):
          """Teach the classifier by example.
--- 333,336 ----
***************
*** 479,480 ****
--- 450,601 ----
                  if record.hamcount == 0 == record.spamcount:
                      del self.wordinfo[word]
+ 
+ 
+     #************************************************************************
+     # Some options change so much behavior that it's better to write a
+     # different method.  CAUTION:  when the corresponding option is enabled,
+     # the assignments below rebind the spamprob / update_probabilities
+     # methods defined above.  A subclass would be cleaner, but experiments
+     # will soon enough lead to only one of the alternatives surviving.
+ 
+     def robinson_spamprob(self, wordstream, evidence=False):
+         """Return best-guess probability that wordstream is spam.
+ 
+         wordstream is an iterable object producing words.
+         The return value is a float in [0.0, 1.0].
+ 
+         If optional arg evidence is True, the return value is a pair
+             probability, evidence
+         where evidence is a list of (word, probability) pairs.
+         """
+ 
+         from math import frexp
+ 
+         # A priority queue to remember the MAX_DISCRIMINATORS best
+         # probabilities, where "best" means largest distance from 0.5.
+         # The tuples are (distance, prob, word, wordinfo[word]).
+         nbest = [(-1.0, None, None, None)] * MAX_DISCRIMINATORS
+         smallest_best = -1.0
+ 
+         wordinfoget = self.wordinfo.get
+         now = time.time()
+         for word in Set(wordstream):
+             record = wordinfoget(word)
+             if record is None:
+                 prob = UNKNOWN_SPAMPROB
+             else:
+                 record.atime = now
+                 prob = record.spamprob
+ 
+             distance = abs(prob - 0.5)
+             if distance > smallest_best:
+                 heapreplace(nbest, (distance, prob, word, record))
+                 smallest_best = nbest[0][0]
+ 
+         # Compute the probability.
+         clues = []
+ 
+         # This combination method is due to Gary Robinson.
+         # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
+         # In preliminary tests, it did just as well as Graham's scheme,
+         # but creates a definite "middle ground" around 0.5 where false
+         # negatives and false positives can actually be found in
+         # non-trivial numbers.
+ 
+         # The real P = this P times 2**Pexp.  Likewise for Q.  We're
+         # simulating unbounded dynamic float range by hand.  If this pans
+         # out, *maybe* we should store logarithms in the database instead
+         # and just add them here.
+         P = Q = 1.0
+         Pexp = Qexp = 0
+         num_clues = 0
+         for distance, prob, word, record in nbest:
+             if prob is None:    # it's one of the dummies nbest started with
+                 continue
+             if record is not None:  # else wordinfo doesn't know about it
+                 record.killcount += 1
+             if evidence:
+                 clues.append((word, prob))
+             num_clues += 1
+             P *= 1.0 - prob
+             Q *= prob
+             if P < 1e-200:  # move back into range
+                 P, e = frexp(P)
+                 Pexp += e
+             if Q < 1e-200:  # move back into range
+                 Q, e = frexp(Q)
+                 Qexp += e
+ 
+         P, e = frexp(P)
+         Pexp += e
+         Q, e = frexp(Q)
+         Qexp += e
+ 
+         if num_clues:
+             #P = 1.0 - P**(1./num_clues)
+             #Q = 1.0 - Q**(1./num_clues)
+             #
+             # (x*2**e)**n = x**n * 2**(e*n)
+             n = 1.0 / num_clues
+             P = 1.0 - P**n * 2.0**(Pexp * n)
+             Q = 1.0 - Q**n * 2.0**(Qexp * n)
+ 
+             prob = (P-Q)/(P+Q)  # in -1 .. 1
+             prob = 0.5 + prob/2 # shift to 0 .. 1
+         else:
+             prob = 0.5
+ 
+         if evidence:
+             clues.sort(lambda a, b: cmp(a[1], b[1]))
+             return prob, clues
+         else:
+             return prob
+ 
+     if options.use_robinson_combining:
+         spamprob = robinson_spamprob
+ 
+     def robinson_update_probabilities(self):
+         """Update the word probabilities in the spam database.
+ 
+         This computes a new probability for every word in the database,
+         so it can be expensive.  learn() and unlearn() update the
+         probabilities each time by default.  They have an optional argument
+         that allows you to skip this step when feeding in many messages;
+         in that case you should call update_probabilities() after feeding
+         the last message and before calling spamprob().
+         """
+ 
+         nham = float(self.nham or 1)
+         nspam = float(self.nspam or 1)
+         A = options.robinson_probability_a
+         X = options.robinson_probability_x
+         AoverX = A/X
+         for word, record in self.wordinfo.iteritems():
+             # Compute prob(msg is spam | msg contains word).
+             # This is the Graham calculation, but stripped of biases, and
+             # of clamping into 0.01 thru 0.99.
+             hamcount = min(record.hamcount, nham)
+             hamratio = hamcount / nham
+ 
+             spamcount = min(record.spamcount, nspam)
+             spamratio = spamcount / nspam
+ 
+             prob = spamratio / (hamratio + spamratio)
+ 
+             # Now do Robinson's Bayesian adjustment.
+             #
+             #         a + (n * p(w))
+             # f(w) = ---------------
+             #          (a / x) + n
+             n = hamcount + spamcount
+             prob = (A + n * prob) / (AoverX + n)
+ 
+             if record.spamprob != prob:
+                 record.spamprob = prob
+                 # The next seemingly pointless line appears to be a hack
+                 # to allow a persistent db to realize the record has changed.
+                 self.wordinfo[word] = record
+ 
+ 
+     if options.use_robinson_probability:
+         update_probabilities = robinson_update_probabilities
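
Taken together, the checkin wires in two pieces of Robinson math: the
geometric-mean combining of the best clues (robinson_spamprob) and the
Bayesian adjustment f(w) applied to each word's raw probability
(robinson_update_probabilities).  Below is a standalone sketch of just that
arithmetic, stripped of the classifier plumbing (the discriminator heap,
killcounts, and the frexp underflow trick); the function names are
illustrative, not the project's:

    def robinson_combine(probs):
        """Combine per-word spam probabilities into one score in [0.0, 1.0]."""
        if not probs:
            return 0.5
        P = Q = 1.0
        for p in probs:
            P *= 1.0 - p
            Q *= p
        n = 1.0 / len(probs)
        P = 1.0 - P**n          # 1 - geometric mean of the (1-p) values
        Q = 1.0 - Q**n          # 1 - geometric mean of the p values
        S = (P - Q) / (P + Q)   # in -1 .. 1
        return 0.5 + S / 2.0    # shift to 0 .. 1

    def robinson_adjust(prob, n, a=1.0, x=0.5):
        """Bayesian adjustment f(w) = (a + n*p(w)) / (a/x + n).

        n is the number of messages containing the word; a and x correspond
        to the new robinson_probability_a / robinson_probability_x options.
        """
        return (a + n * prob) / (a / x + n)

    # Three strong spam clues and one hammy clue still lean spam:
    print(robinson_combine([0.99, 0.95, 0.90, 0.20]))   # about 0.72

    # A word seen in just one message is pulled two thirds of the way back
    # toward x = 0.5, which is the point of the adjustment: sparse counts
    # contribute weaker evidence.
    print(robinson_adjust(1.0, 1))                       # 2/3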