[Tutor] about a program
Abdirizak abdi
a_abdi406@yahoo.com
Sun Jun 8 09:27:01 2003
--0-1080038127-1055078789=:59825
Content-Type: multipart/alternative; boundary="0-1564231918-1055078789=:59825"
--0-1564231918-1055078789=:59825
Content-Type: text/plain; charset=us-ascii
Hi,
I was working on a program that verifies whether a given message is spam or not. The program uses statistical analysis based on Paul Graham's "A Plan for Spam". I set up the algorithm of the program as follows:
1. Read the mails, spam or non-spam, from their respective directories (build_corpus()).
2. Compute the frequency of each word.
3. Get a message to check for "spamness" and compute the probability of each word
by using the frequency of the words, and put it in a dictionary {'word': probability}.
4. Take the 15 most improbable words, put them in a list, and combine them.
5. If the score of the combination is greater than 90%, then the message is spam.
PROBLEM: I am getting wrong values, so could anyone have a look at the program setup and comment, so that I can figure out why I am getting the wrong values?
I know it is hard to follow someone else's program setup, but sometimes a third party can be a lot of help.
the program is attached with this e-mail
thanks in advance
---------------------------------
Do you Yahoo!?
Free online calendar with sync to Outlook(TM).
--0-1564231918-1055078789=:59825
Content-Type: text/html; charset=us-ascii
<DIV>
<DIV><FONT size=2>
<P>Hi,</P>
<P>I was working on a program that verifies whether a given message is spam or not. The program uses statistical analysis based on Paul Graham's "A Plan for Spam". I set up the algorithm of the program as follows:</P>
<P>1. Read the mails, spam or non-spam, from their respective directories (build_corpus()).</P>
<P>2. Compute the frequency of each word.</P>
<P>3. Get a message to check for "spamness" and compute the probability of each word</P>
<P>by using the frequency of the words, and put it in a dictionary {'word': probability}.</P>
<P>4. Take the 15 most improbable words, put them in a list, and combine them.</P>
<P>5. If the score of the combination is greater than 90%, then the message is spam.</P>
<P>PROBLEM: I am getting wrong values, so could anyone have a look at the program setup and comment, so that I can figure out why I am getting the wrong values?</P>
<P>I know it is hard to follow someone else's program setup, but sometimes a third party can be a lot of help.</P>
<P>the program is attached with this e-mail</P>
<P>thanks in advance</P></FONT></DIV></DIV><p><hr SIZE=1>
Do you Yahoo!?<br>
Free <a href="http://us.rd.yahoo.com/mail_us/tag/*http://calendar.yahoo.com">online calendar</a> with sync to Outlook(TM).
--0-1564231918-1055078789=:59825--
--0-1080038127-1055078789=:59825
Content-Type: text/plain; name="TestTest.py"
Content-Description: TestTest.py
Content-Disposition: inline; filename="TestTest.py"
import sys
import os
import re
import math
import cPickle
import string
#from utils import *
from glob import glob
def incr(hash, key):
    """Increment the counter stored under *key* in the mapping *hash*.

    A key that is not present yet is treated as having a count of 0, so
    the first call for a key leaves it at 1.  The mapping is modified in
    place and nothing is returned.
    """
    hash[key] = hash.get(key, 0) + 1
#this function used glob which reads multiple files
def getwords(fn, addfreq):
    """Add the word counts of the file *fn* to the table *addfreq*.

    The file is split on whitespace; every token shorter than 100
    characters is lower-cased and counted via incr().  The special key
    '*' is then incremented to record that one more file was processed.

    Parameters:
        fn      -- path of the text file to read.
        addfreq -- dictionary mapping word -> count, updated in place.
    """
    handle = open(fn)
    try:
        text = handle.read()
    finally:
        # Close the file even when reading fails.
        handle.close()

    for word in text.split():
        # Skip very long tokens.
        if len(word) >= 100:
            continue
        lw = word.lower()
        # '*' is reserved for the bookkeeping counter below; a literal
        # "*" token in the text must not inflate that counter.
        if lw == '*':
            continue
        incr(addfreq, lw)

    # NOTE(review): the original indentation was lost.  This counts once
    # per file, on the assumption that '*' is the number of messages
    # (generate_prob clamps count / '*' to 1.0, which only makes sense
    # for a per-message total) -- confirm against the author's copy.
    incr(addfreq, '*')
class Classifier(ClassifierI):
    """Word-frequency spam classifier.

    Two dictionaries hold the training data:

        spam    -- lower-cased word -> count seen in the spam corpus
        nonspam -- lower-cased word -> count seen in the non-spam corpus

    In both dictionaries the key '*' holds the corpus total that
    generate_prob() uses as the denominator of the word frequencies.

    NOTE(review): ClassifierI, Token, LabeledText, report_tokens and
    combine are not defined in this file (the "from utils import *" line
    is commented out); they must be provided elsewhere.
    """

    # Probability assigned to words with too little evidence.
    UNKNOWN_PROB = 0.2
    # Minimum weighted occurrence count before a word is trusted.
    MIN_OCCURRENCES = 5
    # Combined score above which a message is reported as spam.
    SPAM_THRESHOLD = 0.90

    def __init__(self):
        self.spam = {'*': 0}
        self.nonspam = {'*': 0}

    def classify(self, token):
        """Return *token* re-labelled as 'spam'.

        Placeholder implementation: the label is always 'spam',
        regardless of the token's content.
        """
        return Token(LabeledText(token.type(), 'spam'), token.loc())

    def labels(self):
        """Return the tuple of labels this classifier can assign."""
        return ('spam', 'nonspam')

    def generate_prob(self, word):
        """Return the estimated probability that *word* indicates spam.

        The word is lower-cased and looked up in both corpora.  The
        non-spam count is doubled before use.  Words whose weighted
        count (doubled non-spam count + spam count) is below
        MIN_OCCURRENCES get UNKNOWN_PROB.  Otherwise the result is

            bfreq / (gfreq + bfreq)

        where each frequency is count / corpus total capped at 1.0, and
        the result is clamped to the range [0.01, 0.99].
        """
        lowerWord = word.lower()

        # Non-spam occurrences are weighted double.
        g = float(self.nonspam.get(lowerWord, 0) * 2)
        # Debug trace kept from the original program.
        print("g = %5.3f" % (g))
        b = float(self.spam.get(lowerWord, 0))

        goodCount = self.nonspam['*']
        badCount = self.spam['*']

        # Unseen words (g == b == 0) and rare words are both covered by
        # this single test because the counts are never negative.
        if g + b < self.MIN_OCCURRENCES:
            return self.UNKNOWN_PROB

        # Guard against an empty corpus instead of dividing by zero.
        bfreq = 0.0
        if badCount:
            bfreq = min(1.0, b / badCount)
        gfreq = 0.0
        if goodCount:
            gfreq = min(1.0, g / goodCount)

        total = gfreq + bfreq
        if total == 0:
            return self.UNKNOWN_PROB

        # The original computed "bfreq / gfreq + bfreq", which divides
        # before adding; the intended ratio needs the parentheses.
        return max(0.01, min(0.99, bfreq / total))

    def isSpam(self, Message):
        """Score *Message* and print whether it is considered spam.

        Every word-like token of the message is rated with
        generate_prob(); the ratings are handed to report_tokens() and
        the resulting list to combine().  The combined score is printed,
        followed by a verdict line.  Nothing is returned.
        """
        # Tokens: runs of word characters, '-', "'" and '$'.
        word_like = re.compile(r"[-\w'$]+")

        # word -> distance of its spam probability from the neutral 0.5
        temp_dict = {}
        for word in word_like.findall(Message):
            p = self.generate_prob(word)
            # NOTE(review): only the distance from 0.5 is stored, so the
            # probability itself is not available to combine() unless
            # report_tokens() recovers it -- verify against those helpers.
            temp_dict[word] = abs(p - 0.5)

        top_tokens = report_tokens(temp_dict)
        score = combine(top_tokens)
        print(score)

        if score > self.SPAM_THRESHOLD:
            spam_message = "the message is spammed....."
            print(spam_message)
        else:
            non_spam_message = "the message is non spammed....."
            print(non_spam_message)
def build_corpus(spam_pattern="Spam/msg*.txt",
                 nonspam_pattern="NonSpam/1000*-*.txt"):
    """Build and return a Classifier trained from mail files on disk.

    Parameters:
        spam_pattern    -- glob pattern selecting the spam messages.
        nonspam_pattern -- glob pattern selecting the non-spam messages.

    Both default to the locations the program originally hard-coded.
    Progress is printed; the reported sizes are the raw dictionary
    lengths and therefore include the '*' bookkeeping entry.
    """
    print("Scanning the files in the directory......")
    count = Classifier()

    for path in glob(spam_pattern):
        getwords(path, count.spam)
    print("Spam: %d words known" % len(count.spam))

    for path in glob(nonspam_pattern):
        getwords(path, count.nonspam)
    print("Nonspam: %d words known" % len(count.nonspam))

    return count
def main():
    """Train on the corpus directories, then score one sample message."""
    # Keep the trained classifier.  The original discarded the result of
    # build_corpus() and scored the message with a new, empty
    # Classifier, so every word received the default probability.
    sw = build_corpus()

    # NOTE(review): save_Data is not defined in this file (the utils
    # import is commented out); it has to be supplied elsewhere.
    save_Data()

    # for testing; a spam sample would be 'Spam/msg101.txt'
    sample = open('NonSpam/10002-nspm.txt')
    try:
        text = sample.read()
    finally:
        sample.close()

    sw.isSpam(text)
# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
#for arg in sys.argv[1:]:
#global results
#load_Data()
#build_corpus()
#save_Data()
#sw =Classifier()
# for tseting
#'Spam/msg101.txt'
#file = open('10002-nspm.txt')
#text = file.read()
#sw.isSpam(text)
#------------------------------------------
#a = Classifier()
#rint a.classify(Token("this is a sentence"))
#b = a.hello()
#b
#------------------------------
--0-1080038127-1055078789=:59825--