[Spambayes-checkins] spambayes/contrib tte.py,1.11,1.12
Skip Montanaro
montanaro at users.sourceforge.net
Sun Jul 25 05:00:55 CEST 2004
Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3684
Modified Files:
tte.py
Log Message:
Add --ratio=N flag to allow the user to adjust the ratio of spam to ham.
Seems unfortunately necessary in this spam-heavy world we live in.
Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** tte.py 10 Jul 2004 02:59:41 -0000 1.11
--- tte.py 25 Jul 2004 03:00:53 -0000 1.12
***************
*** 23,31 ****
all messages score correctly.
! -c ext - Cull all messages which aren't used as training input during any run
! and write to new ham and spam files with ext as an extra file extension.
! All messages which are never considered (because one training set is
! longer than the other or the -m flag was used to reduce the amount of
! input) are retained.
-o sect:opt:val -
--- 23,31 ----
all messages score correctly.
! -c ext - Cull all messages which aren't used as training input during any
! run and write to new ham and spam files with ext as an extra file
! extension. All messages which are never considered (because one
! training set is longer than the other or the -m flag was used to
! reduce the amount of input) are retained.
-o sect:opt:val -
***************
*** 36,39 ****
--- 36,43 ----
-R Walk backwards through the mailbox.
+ --ratio=n Define the number of spam messages to be trained for each ham.
+ The default is 1, but given the sorry state of the Net's email
+ infrastructure these days you'll probably want to raise it. Keep
+ it as close to 1 as you can...
Note: The -c command line argument isn't quite as benign as it might first
***************
*** 91,95 ****
return iter(seq)
! def train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose):
smisses = hmisses = round = 0
ham_cutoff = Options.options["Categorization", "ham_cutoff"]
--- 95,100 ----
return iter(seq)
! def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
! ratio):
smisses = hmisses = round = 0
ham_cutoff = Options.options["Categorization", "ham_cutoff"]
***************
*** 97,102 ****
while round < maxrounds and (hmisses or smisses or round == 0):
! hambone = mboxutils.getmbox(ham)
! spamcan = mboxutils.getmbox(spam)
if reverse:
hambone = reversed(list(hambone))
--- 102,107 ----
while round < maxrounds and (hmisses or smisses or round == 0):
! hambone = mboxutils.getmbox(hambox)
! spamcan = mboxutils.getmbox(spambox)
if reverse:
hambone = reversed(list(hambone))
***************
*** 111,124 ****
try:
while not maxmsgs or nmsgs < maxmsgs:
! hammsg = hambone.next()
! spammsg = spamcan.next()
! nmsgs += 2
sys.stdout.write("\r%5d" % nmsgs)
sys.stdout.flush()
! score = store.spamprob(tokenize(hammsg))
! selector = (hammsg["message-id"] or
! hammsg["subject"])
if score > ham_cutoff and selector is not None:
if verbose:
--- 116,137 ----
try:
while not maxmsgs or nmsgs < maxmsgs:
! ham = hambone.next()
! spams = []
! for i in range(ratio):
! try:
! spams.append(spamcan.next())
! except StopIteration:
! # no spams left so exit
! if not spams:
! raise
! # use what we've collected
! break
! nmsgs += 1 + len(spams)
sys.stdout.write("\r%5d" % nmsgs)
sys.stdout.flush()
! score = store.spamprob(tokenize(ham))
! selector = ham["message-id"] or ham["subject"]
if score > ham_cutoff and selector is not None:
if verbose:
***************
*** 126,142 ****
score, selector)
hmisses += 1
! tdict[hammsg["message-id"]] = True
! store.learn(tokenize(hammsg), False)
! score = store.spamprob(tokenize(spammsg))
! selector = (spammsg["message-id"] or
! spammsg["subject"])
! if score < spam_cutoff and selector is not None:
! if verbose:
! print >> sys.stderr, "miss spam: %.6f %s" % (
! score, selector)
! smisses += 1
! tdict[spammsg["message-id"]] = True
! store.learn(tokenize(spammsg), True)
except StopIteration:
--- 139,156 ----
score, selector)
hmisses += 1
! tdict[ham["message-id"]] = True
! store.learn(tokenize(ham), False)
! for spam in spams:
! score = store.spamprob(tokenize(spam))
! selector = (spam["message-id"] or
! spam["subject"])
! if score < spam_cutoff and selector is not None:
! if verbose:
! print >> sys.stderr, "miss spam: %.6f %s" % (
! score, selector)
! smisses += 1
! tdict[spam["message-id"]] = True
! store.learn(tokenize(spam), True)
except StopIteration:
***************
*** 176,180 ****
"database=", "pickle=", "verbose",
"option=", "max=", "maxrounds=",
! "cullext=", "reverse"])
except getopt.GetoptError, msg:
usage(msg)
--- 190,194 ----
"database=", "pickle=", "verbose",
"option=", "max=", "maxrounds=",
! "cullext=", "reverse", "ratio="])
except getopt.GetoptError, msg:
usage(msg)
***************
*** 186,189 ****
--- 200,204 ----
verbose = False
reverse = False
+ sh_ratio = 1
for opt, arg in opts:
if opt in ("-h", "--help"):
***************
*** 206,209 ****
--- 221,226 ----
elif opt in ('-o', '--option'):
Options.options.set_from_cmdline(arg, sys.stderr)
+ elif opt == '--ratio':
+ sh_ratio = int(arg)
if ham is None or spam is None:
***************
*** 221,225 ****
tdict = {}
! train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose)
store.store()
--- 238,243 ----
tdict = {}
! train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose,
! sh_ratio)
store.store()
More information about the Spambayes-checkins mailing list