[Spambayes-checkins]
spambayes/pspam/pspam __init__.py,NONE,1.1 database.py,NONE,1.1
folder.py,NONE,1.1 message.py,NONE,1.1 options.py,NONE,1.1
profile.py,NONE,1.1
Jeremy Hylton
jhylton@users.sourceforge.net
Mon Nov 4 04:44:22 2002
- Previous message: [Spambayes-checkins] spambayes/pspam README.txt,NONE,1.1
pop.py,NONE,1.1 vmspam.ini,NONE,1.1 zeo.sh,NONE,1.1
- Next message: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.25,1.26
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/spambayes/spambayes/pspam/pspam
In directory usw-pr-cvs1:/tmp/cvs-serv21558/pspam/pspam
Added Files:
__init__.py database.py folder.py message.py options.py
profile.py
Log Message:
Initial checkin of pspam code.
--- NEW FILE: __init__.py ---
"""Package for interacting with VM folders.
Design notes go here.
Use ZODB to store training data and classifier.
The spam and ham data are culled from sets of folders. The actual
tokenized messages are stored in a training database. When the folder
changes, the training data is updated.
- Updates are incremental.
- Changes to a folder are detected based on mtime and folder size.
- The contents of the folder are keyed on message-id.
- If a message is removed from a folder, it is removed from training data.
"""
--- NEW FILE: database.py ---
from pspam.options import options
import ZODB
from ZEO.ClientStorage import ClientStorage
import zLOG
import os
def logging():
    """Export the configured event-log settings and initialize zLOG.

    Copies ``options.event_log_file`` and ``options.event_log_severity``
    into the ``STUPID_LOG_FILE`` / ``STUPID_LOG_SEVERITY`` environment
    variables (presumably the ones zLOG consults -- confirm against the
    zLOG documentation) and then calls ``zLOG.initialize()``.
    """
    env = os.environ
    env["STUPID_LOG_FILE"] = options.event_log_file
    # Environment values must be strings, hence the explicit str().
    severity = str(options.event_log_severity)
    env["STUPID_LOG_SEVERITY"] = severity
    zLOG.initialize()
def open():
    """Open the database stored behind the configured ZEO server.

    Builds a ``ClientStorage`` for ``options.zeo_addr`` and wraps it in a
    ``ZODB.DB`` whose cache size is ``options.cache_size``.

    Returns:
        The newly created ``ZODB.DB`` instance.

    Note: the function name shadows the builtin ``open`` inside this
    module; it is kept because it is the module's public interface.
    """
    storage = ClientStorage(options.zeo_addr)
    return ZODB.DB(storage, cache_size=options.cache_size)
--- NEW FILE: folder.py ---
import ZODB
from Persistence import Persistent
from BTrees.OOBTree import OOBTree, OOSet, difference
import email
import mailbox
import os
import stat
from pspam.message import PMessage
def factory(fp):
try:
return email.message_from_file(fp, PMessage)
except email.Errors.MessageError, msg:
print msg
return PMessage()
class Folder(Persistent):
    """Persistent record of one mbox file and the messages seen in it.

    Attributes:
        path: filesystem path of the mbox file.
        mtime: modification time recorded at the last read (0 before any).
        size: file size recorded at the last read (0 before any).
        messages: OOBTree mapping message-id -> message object.
    """

    def __init__(self, path):
        self.path = path
        self.mtime = 0
        self.size = 0
        self.messages = OOBTree()

    def _stat(self):
        """Record the file's current mtime and size on this object."""
        t = os.stat(self.path)
        self.mtime = t[stat.ST_MTIME]
        self.size = t[stat.ST_SIZE]

    def changed(self):
        """Return true if the file's mtime or size differs from the recorded one."""
        t = os.stat(self.path)
        return (t[stat.ST_MTIME] != self.mtime
                or t[stat.ST_SIZE] != self.size)

    def read(self):
        """Return messages added and removed from folder.

        Two sets of message objects are returned.  The first set is
        messages that were added to the folder since the last read.
        The second set is the messages that were removed from the
        folder since the last read.

        The code assumes messages are added and removed but not edited.

        Side effects: refreshes the recorded mtime/size, stores new
        messages in ``self.messages`` and deletes the removed ones.
        """
        # Close the mbox file explicitly instead of relying on garbage
        # collection; every message is parsed by factory() inside the
        # loop, so nothing reads from the file after it is closed.
        fp = open(self.path, "rb")
        try:
            mbox = mailbox.UnixMailbox(fp, factory)
            # The stat is taken before the scan, as in the original code.
            self._stat()
            cur = OOSet()
            new = OOSet()
            while 1:
                msg = mbox.next()
                if msg is None:
                    break
                # NOTE(review): messages lacking a Message-ID header would
                # all share a single key here -- confirm whether that matters.
                msgid = msg["message-id"]
                cur.insert(msgid)
                if not self.messages.has_key(msgid):
                    self.messages[msgid] = msg
                    new.insert(msg)
        finally:
            fp.close()
        # Known ids that were not seen in this scan have been removed.
        removed = difference(self.messages, cur)
        for msgid in removed.keys():
            del self.messages[msgid]
        # XXX perhaps just return the OOBTree for removed?
        return new, OOSet(removed.values())
if __name__ == "__main__":
    # Ad-hoc manual check: only constructs a Folder for a hard-coded
    # developer mailbox path; nothing is read or stored.
    f = Folder("/home/jeremy/Mail/INBOX")
--- NEW FILE: message.py ---
import ZODB
from Persistence import Persistent
from email.Message import Message
class PMessage(Message, Persistent):
    """An email ``Message`` that is also a ZODB ``Persistent`` object."""

    def __hash__(self):
        """Hash by object identity rather than by message content."""
        identity = id(self)
        return identity
--- NEW FILE: options.py ---
from Options import options, all_options, \
boolean_cracker, float_cracker, int_cracker, string_cracker
from sets import Set
# Register the pspam-specific option sections with the shared option
# table.  Each section maps an option name to the converter used for it.

# [Score]
all_options["Score"] = {
    'max_ham': float_cracker,
    'min_spam': float_cracker,
}

# [Train] -- the folder lists are whitespace-separated names turned into
# Sets.  NOTE(review): the ('get', callable) pair is presumably the same
# shape as the *_cracker values -- confirm against Options.py.
all_options["Train"] = {
    'folder_dir': string_cracker,
    'spam_folders': ('get', lambda s: Set(s.split())),
    'ham_folders': ('get', lambda s: Set(s.split())),
}

# [Proxy]
all_options["Proxy"] = {
    'server': string_cracker,
    'server_port': int_cracker,
    'proxy_port': int_cracker,
    'log_pop_session': boolean_cracker,
    'log_pop_session_file': string_cracker,
}

# [ZODB]
all_options["ZODB"] = {
    'zeo_addr': string_cracker,
    'event_log_file': string_cracker,
    'event_log_severity': int_cracker,
    'cache_size': int_cracker,
}
import os
# Runs at import time: merge the settings from "vmspam.ini" into the
# shared options object.
options.mergefiles("vmspam.ini")


def mergefile(p):
    """Merge the option file named by *p* into the shared options object."""
    options.mergefiles(p)
--- NEW FILE: profile.py ---
"""Spam/ham profile for a single VM user."""
import ZODB
from ZODB.PersistentList import PersistentList
from Persistence import Persistent
from BTrees.OOBTree import OOBTree
import classifier
from tokenizer import tokenize
from pspam.folder import Folder
import os
def open_folders(dir, names, klass):
    """Instantiate *klass* once for every folder name under *dir*.

    Args:
        dir: directory that the folder names are joined onto.
        names: iterable of folder names.
        klass: callable taking the joined path; called once per name.

    Returns:
        A list of the objects returned by *klass*, in the order of *names*.
    """
    return [klass(os.path.join(dir, name)) for name in names]
import time
# Time of the first log() call; None until log() has been called once.
_start = None


def log(s):
    """Print *s* prefixed with the seconds elapsed since the first call.

    The first call records the start time, so its own prefix is close to
    zero.  The elapsed time is rounded to two decimal places.
    """
    global _start
    if _start is None:
        _start = time.time()
    elapsed = round(time.time() - _start, 2)
    # One formatted string gives the same "elapsed<space>text" output as
    # printing the two values separated by a comma.
    print("%s %s" % (elapsed, s))
class IterOOBTree(OOBTree):
    """OOBTree that additionally answers ``iteritems()``.

    NOTE(review): presumably present because the classifier code calls
    ``wordinfo.iteritems()`` -- confirm against classifier.py.
    """

    def iteritems(self):
        """Return the result of ``items()`` under the dict-style name."""
        pairs = self.items()
        return pairs
class WordInfo(Persistent):
    """Persistent per-word record used by the classifier.

    Attributes:
        atime: value supplied by the caller at construction time.
        spamcount, hamcount, killcount: counters, all starting at 0.
        spamprob: supplied probability, or None when not given.
    """

    def __init__(self, atime, spamprob=None):
        self.atime = atime
        self.spamcount = self.hamcount = self.killcount = 0
        self.spamprob = spamprob

    def __repr__(self):
        """Return ``WordInfo(atime, spamcount, hamcount, killcount, spamprob)``."""
        state = (self.atime, self.spamcount,
                 self.hamcount, self.killcount,
                 self.spamprob)
        # Format the tuple itself.  Feeding repr(state) to "%r" applied
        # repr twice and wrapped the tuple text in stray quote characters.
        # The 1-tuple keeps "%" from unpacking `state` as several arguments.
        return "WordInfo%r" % (state,)
class PBayes(classifier.Bayes, Persistent):
    """``classifier.Bayes`` variant that is persistent and BTree-backed."""

    # Word records are created as the persistent WordInfo defined in this
    # module.
    WordInfoClass = WordInfo

    def __init__(self):
        classifier.Bayes.__init__(self)
        # Assigned after the base initializer has run, so this container
        # takes precedence over any wordinfo the base class may have set.
        word_table = IterOOBTree()
        self.wordinfo = word_table

    # XXX what about the getstate and setstate defined in base class
class Profile(Persistent):
    """Persistent training profile: a classifier plus its ham/spam folders.

    NOTE(review): the archived source lost its indentation; the nesting
    below is reconstructed from the statements themselves -- confirm it
    against the CVS original.
    """

    # Class used to create folder objects in add_ham() / add_spam().
    FolderClass = Folder

    def __init__(self, folder_dir):
        self._dir = folder_dir
        self.classifier = PBayes()
        self.hams = PersistentList()
        self.spams = PersistentList()

    def add_ham(self, folder):
        """Register *folder* (a name joined onto the profile dir) as ham."""
        ham_path = os.path.join(self._dir, folder)
        self.hams.append(self.FolderClass(ham_path))

    def add_spam(self, folder):
        """Register *folder* (a name joined onto the profile dir) as spam."""
        spam_path = os.path.join(self._dir, folder)
        self.spams.append(self.FolderClass(spam_path))

    def update(self):
        """Update classifier from current folder contents."""
        # Both folder lists are always processed; neither call is skipped.
        ham_changed = self._update(self.hams, False)
        spam_changed = self._update(self.spams, True)
        if ham_changed or spam_changed:
            self.classifier.update_probabilities()
        # NOTE(review): the commit and the log line are taken to run
        # unconditionally (outside the `if`) -- confirm.
        get_transaction().commit()
        log("updated probabilities")

    def _update(self, folders, is_spam):
        """Learn/unlearn the messages added to/removed from *folders*.

        Args:
            folders: folder objects to re-read.
            is_spam: flag passed through to the classifier's learn() and
                unlearn() calls.

        Returns:
            True if any folder reported added or removed messages.
        """
        any_changed = False
        for fld in folders:
            log("update from %s" % fld.path)
            added, removed = fld.read()
            if added:
                log("added %d" % len(added))
            if removed:
                log("removed %d" % len(removed))
            # NOTE(review): this full commit runs for every folder, which
            # looks at odds with the comment below -- confirm the intent.
            get_transaction().commit()
            if not added and not removed:
                continue
            any_changed = True

            # It's important not to commit a transaction until
            # after update_probabilities is called in update().
            # Otherwise some new entries will cause scoring to fail.
            for msg in added.keys():
                self.classifier.learn(tokenize(msg), is_spam, False)
            del added
            get_transaction().commit(1)
            log("learned")

            for msg in removed.keys():
                self.classifier.unlearn(tokenize(msg), is_spam, False)
            if removed:
                log("unlearned")
            del removed
            get_transaction().commit(1)
        return any_changed
- Previous message: [Spambayes-checkins] spambayes/pspam README.txt,NONE,1.1
pop.py,NONE,1.1 vmspam.ini,NONE,1.1 zeo.sh,NONE,1.1
- Next message: [Spambayes-checkins] spambayes/Outlook2000 addin.py,1.25,1.26
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]