[Spambayes-checkins] spambayes/spambayes Options.py, 1.72,
1.73 classifier.py, 1.9, 1.10
Tony Meyer
anadelonbrin at users.sourceforge.net
Sun Sep 14 21:08:13 EDT 2003
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8prcvs1:/tmp/cvsserv10633/spambayes
Modified Files:
Options.py classifier.py
Log Message:
Goodbye Gary-combining! (Use the Last-Gary tag to get it back).
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.72
retrieving revision 1.73
diff -C2 -d -r1.72 -r1.73
*** Options.py 9 Sep 2003 07:03:54 -0000 1.72
--- Options.py 15 Sep 2003 01:08:11 -0000 1.73
***************
*** 371,382 ****
REAL, RESTORE),
- ("use_gary_combining", "Use gary-combining", False,
- """The combining scheme currently detailed on the Robinson web page.
- The middle ground here is touchy, varying across corpus, and within
- a corpus across amounts of training data. It almost never gives
- extreme scores (near 0.0 or 1.0), but the tail ends of the ham and
- spam distributions overlap.""",
- BOOLEAN, RESTORE),

("use_chi_squared_combining", "Use chi-squared combining", True,
"""For vectors of random, uniformly distributed probabilities,
--- 371,374 ----
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** classifier.py 5 Sep 2003 01:15:28 -0000 1.9
--- classifier.py 15 Sep 2003 01:08:11 -0000 1.10
***************
*** 25,28 ****
--- 25,31 ----
# advance, and it's touchy.
#
+ # The last version of the Gary-combining scheme can be retrieved from our
+ # CVS repository via tag Last-Gary.
+ #
# The chi-combining scheme used by default here gets closer to the theoretical
# basis of Gary's combining scheme, and does give extreme scores, but also
***************
*** 108,181 ****
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
!
! def gary_spamprob(self, wordstream, evidence=False):
! """Return best-guess probability that wordstream is spam.
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
!
! If optional arg evidence is True, the return value is a pair
! probability, evidence
! where evidence is a list of (word, probability) pairs.
! """
!
! from math import frexp
!
! # This combination method is due to Gary Robinson; see
! # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
!
! # The real P = this P times 2**Pexp. Likewise for Q. We're
! # simulating unbounded dynamic float range by hand. If this pans
! # out, *maybe* we should store logarithms in the database instead
! # and just add them here. But I like keeping raw counts in the
! # database (they're easy to understand, manipulate and combine),
! # and there's no evidence that this simulation is a significant
! # expense.
! P = Q = 1.0
! Pexp = Qexp = 0
! clues = self._getclues(wordstream)
! for prob, word, record in clues:
! P *= 1.0 - prob
! Q *= prob
! if P < 1e-200: # move back into range
! P, e = frexp(P)
! Pexp += e
! if Q < 1e-200: # move back into range
! Q, e = frexp(Q)
! Qexp += e
!
! P, e = frexp(P)
! Pexp += e
! Q, e = frexp(Q)
! Qexp += e
!
! num_clues = len(clues)
! if num_clues:
! #P = 1.0 - P**(1./num_clues)
! #Q = 1.0 - Q**(1./num_clues)
! #
! # (x*2**e)**n = x**n * 2**(e*n)
! n = 1.0 / num_clues
! P = 1.0 - P**n * 2.0**(Pexp * n)
! Q = 1.0 - Q**n * 2.0**(Qexp * n)
!
! # (P-Q)/(P+Q) is in -1 .. 1; scaling into 0 .. 1 gives
! # ((P-Q)/(P+Q)+1)/2 =
! # ((P-Q+P+Q)/(P+Q))/2 =
! # (2*P/(P+Q))/2 =
! # P/(P+Q)
! prob = P/(P+Q)
! else:
! prob = 0.5
!
! if evidence:
! clues = [(w, p) for p, w, r in clues]
! clues.sort(lambda a, b: cmp(a[1], b[1]))
! return prob, clues
! else:
! return prob
!
! if options["Classifier", "use_gary_combining"]:
! spamprob = gary_spamprob
# Across vectors of length n, containing random uniformly-distributed
--- 111,116 ----
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
! # Currently only chi-squared is available, but maybe there will be
! # an alternative again someday.
# Across vectors of length n, containing random uniformly-distributed
