[Spambayes-checkins] spambayes/pspam/pspam __init__.py,NONE,1.1 database.py,NONE,1.1 folder.py,NONE,1.1 message.py,NONE,1.1 options.py,NONE,1.1 profile.py,NONE,1.1

Jeremy Hylton jhylton@users.sourceforge.net
Mon Nov 4 04:44:22 2002


Update of /cvsroot/spambayes/spambayes/pspam/pspam
In directory usw-pr-cvs1:/tmp/cvs-serv21558/pspam/pspam

Added Files:
	__init__.py database.py folder.py message.py options.py 
	profile.py 
Log Message:
Initial checkin of pspam code.


--- NEW FILE: __init__.py ---
"""Package for interacting with VM folders.

Design notes go here.

Use ZODB to store training data and classifier.

The spam and ham data are culled from sets of folders.  The actual
tokenized messages are stored in a training database.  When the folder
changes, the training data is updated.

- Updates are incremental.
- Changes to a folder are detected based on mtime and folder size.
- The contents of the folder are keyed on message-id.
- If a message is removed from a folder, it is removed from training data.
"""

--- NEW FILE: database.py ---
from pspam.options import options

import ZODB
from ZEO.ClientStorage import ClientStorage
import zLOG

import os

def logging():
    """Configure zLOG from the pspam options and initialize it.

    The log file and severity are handed to zLOG through the
    STUPID_LOG_FILE and STUPID_LOG_SEVERITY environment variables,
    which must be set before zLOG.initialize() is called.
    """
    env = os.environ
    env["STUPID_LOG_FILE"] = options.event_log_file
    # Environment values must be strings; the severity option is an int.
    env["STUPID_LOG_SEVERITY"] = str(options.event_log_severity)
    zLOG.initialize()

def open():
    """Return a ZODB database backed by the configured ZEO server.

    The server address and the object cache size are taken from the
    pspam options (zeo_addr and cache_size).
    """
    return ZODB.DB(ClientStorage(options.zeo_addr),
                   cache_size=options.cache_size)

--- NEW FILE: folder.py ---
import ZODB
from Persistence import Persistent
from BTrees.OOBTree import OOBTree, OOSet, difference

import email
import mailbox
import os
import stat

from pspam.message import PMessage

def factory(fp):
    try:
        return email.message_from_file(fp, PMessage)
    except email.Errors.MessageError, msg:
        print msg
        return PMessage()

class Folder(Persistent):
    """Persistent record of a Unix mbox folder and the messages seen in it.

    The folder remembers the mtime and size of the mbox file as of the
    last read() so that changed() can cheaply detect modifications, and
    keeps every message seen so far in an OOBTree keyed on the value of
    its Message-ID header.
    """

    def __init__(self, path):
        """Create a record for the mbox file at path; nothing is read yet."""
        self.path = path
        # Zero mtime/size means "never read", so changed() reports True
        # for any real, non-empty folder until read() is called.
        self.mtime = 0
        self.size = 0
        self.messages = OOBTree()

    def _stat(self):
        """Record the mbox file's current mtime and size."""
        t = os.stat(self.path)
        self.mtime = t[stat.ST_MTIME]
        self.size = t[stat.ST_SIZE]

    def changed(self):
        """Return true if the file's mtime or size differs from the last read.

        Raises os.error if the file cannot be stat'ed.
        """
        t = os.stat(self.path)
        return (t[stat.ST_MTIME] != self.mtime
                or t[stat.ST_SIZE] != self.size)

    def read(self):
        """Return messages added and removed from folder.

        Two sets of message objects are returned.  The first set is
        messages that were added to the folder since the last read.
        The second set is the messages that were removed from the
        folder since the last read.

        The code assumes messages are added and removed but not edited.

        Raises IOError / os.error if the mbox file cannot be opened or
        stat'ed.
        """
        fp = open(self.path, "rb")
        # Always close the mbox file, even if parsing a message raises;
        # otherwise every read() leaks a file descriptor.  Closing is
        # safe because each message is fully parsed by factory() before
        # mbox.next() returns it.
        try:
            mbox = mailbox.UnixMailbox(fp, factory)
            self._stat()
            cur = OOSet()
            new = OOSet()
            while 1:
                msg = mbox.next()
                if msg is None:
                    break
                # NOTE(review): a message without a Message-ID header
                # (including the empty PMessage that factory() returns
                # on a parse error) yields None here, so all such
                # messages share a single key -- confirm this is
                # acceptable.
                msgid = msg["message-id"]
                cur.insert(msgid)
                if not self.messages.has_key(msgid):
                    self.messages[msgid] = msg
                    new.insert(msg)
        finally:
            fp.close()

        # Anything we knew about that is no longer in the file was removed.
        removed = difference(self.messages, cur)
        for msgid in removed.keys():
            del self.messages[msgid]

        # XXX perhaps just return the OOBTree for removed?
        return new, OOSet(removed.values())

# Ad-hoc manual check when run as a script: only constructs a Folder for
# a hard-coded developer mailbox path; it does not read the folder.
if __name__ == "__main__":
    f = Folder("/home/jeremy/Mail/INBOX")

--- NEW FILE: message.py ---
import ZODB
from Persistence import Persistent
from email.Message import Message

class PMessage(Message, Persistent):
    """An email Message that can also be stored as a persistent object."""

    def __hash__(self):
        # Hash on object identity: two distinct PMessage instances hash
        # differently regardless of their content.
        return id(self)

--- NEW FILE: options.py ---
from Options import options, all_options, \
     boolean_cracker, float_cracker, int_cracker, string_cracker
from sets import Set     

# Register the pspam-specific option sections with the shared Options
# machinery.  Each section maps an option name to the "cracker" used to
# convert its raw value.
# NOTE(review): the exact cracker protocol is defined in the Options
# module, which is not visible here -- confirm before changing formats.

# Score thresholds.
all_options["Score"] = {'max_ham': float_cracker,
                        'min_spam': float_cracker,
                        }

# Training folders.  spam_folders and ham_folders are whitespace-separated
# lists of names that are converted to Sets.
all_options["Train"] = {'folder_dir': string_cracker,
                        'spam_folders': ('get', lambda s: Set(s.split())),
                        'ham_folders': ('get', lambda s: Set(s.split())),
                        }

# Proxy settings (server address/ports and optional POP session logging).
all_options["Proxy"] = {'server': string_cracker,
                        'server_port': int_cracker,
                        'proxy_port': int_cracker,
                        'log_pop_session': boolean_cracker,
                        'log_pop_session_file': string_cracker,
                        }

# ZODB/ZEO connection and event-log settings.
all_options["ZODB"] = {'zeo_addr': string_cracker,
                       'event_log_file': string_cracker,
                       'event_log_severity': int_cracker,
                       'cache_size': int_cracker,
                       }

import os
# Import-time side effect: merge settings from "vmspam.ini" into the
# shared options object.  The name is relative, so presumably it is
# looked up in the current working directory -- TODO confirm.
options.mergefiles("vmspam.ini")

def mergefile(p):
    """Merge the settings from the configuration file p into options."""
    options.mergefiles(p)

--- NEW FILE: profile.py ---
"""Spam/ham profile for a single VM user."""

import ZODB
from ZODB.PersistentList import PersistentList
from Persistence import Persistent
from BTrees.OOBTree import OOBTree

import classifier
from tokenizer import tokenize

from pspam.folder import Folder

import os

def open_folders(dir, names, klass):
    """Return a list with one klass instance per folder name.

    Each name in names is joined onto dir with os.path.join and the
    resulting path is passed to klass.  The result has the same order
    as names.
    """
    return [klass(os.path.join(dir, name)) for name in names]

import time
_start = None
def log(s):
    global _start
    if _start is None:
        _start = time.time()
    print round(time.time() - _start, 2), s


class IterOOBTree(OOBTree):
    """OOBTree that also offers the dict-style iteritems() method."""

    def iteritems(self):
        # Simply delegates to items(); presumably the classifier calls
        # iteritems() on its wordinfo mapping -- verify against
        # classifier.Bayes.
        return self.items()

class WordInfo(Persistent):
    """Persistent per-word record: access time, counts and spam probability."""

    def __init__(self, atime, spamprob=None):
        """Create a record with all counts zero.

        atime is stored as given; spamprob is the initial spam
        probability (None if not yet computed).
        """
        self.atime = atime
        self.spamcount = self.hamcount = self.killcount = 0
        self.spamprob = spamprob

    def __repr__(self):
        # Append the tuple's repr directly.  The earlier form applied %r
        # to an already-repr()'d string, which wrapped the tuple in
        # quotes: WordInfo'(...)' instead of WordInfo(...).
        return "WordInfo" + repr((self.atime, self.spamcount,
                                  self.hamcount, self.killcount,
                                  self.spamprob))

class PBayes(classifier.Bayes, Persistent):
    """Bayes classifier that is persistent and keeps wordinfo in a BTree."""

    # Use the persistent WordInfo defined in this module for word records.
    WordInfoClass = WordInfo

    def __init__(self):
        classifier.Bayes.__init__(self)
        # Replace the wordinfo mapping set up by the base class with a
        # BTree-backed one.
        self.wordinfo = IterOOBTree()

    # XXX what about the getstate and setstate defined in base class

class Profile(Persistent):
    """Persistent spam/ham training state for a single user.

    Holds a PBayes classifier plus the lists of ham and spam folders
    that feed it.  update() re-reads the folders and trains the
    classifier on whatever was added or removed.
    """

    # Class used by add_ham()/add_spam() to wrap a folder path.
    FolderClass = Folder

    def __init__(self, folder_dir):
        """Create an empty profile whose folders live under folder_dir."""
        self._dir = folder_dir
        self.classifier = PBayes()
        self.hams = PersistentList()
        self.spams = PersistentList()

    def add_ham(self, folder):
        """Register folder (a name relative to the folder dir) as ham."""
        p = os.path.join(self._dir, folder)
        f = self.FolderClass(p)
        self.hams.append(f)

    def add_spam(self, folder):
        """Register folder (a name relative to the folder dir) as spam."""
        p = os.path.join(self._dir, folder)
        f = self.FolderClass(p)
        self.spams.append(f)

    def update(self):
        """Update classifier from current folder contents."""
        # Both calls are made unconditionally so that ham and spam
        # folders are always re-read.
        changed1 = self._update(self.hams, False)
        changed2 = self._update(self.spams, True)
        if changed1 or changed2:
            self.classifier.update_probabilities()
        get_transaction().commit()
        # NOTE(review): this message is logged even when nothing changed
        # and update_probabilities() was not called.
        log("updated probabilities")
        
    def _update(self, folders, is_spam):
        """Train the classifier on changes in folders.

        Each folder is re-read; added messages are learned and removed
        messages are unlearned, as spam or ham according to is_spam.
        Returns True if any folder had additions or removals.
        """
        changed = False
        for f in folders:
            log("update from %s" % f.path)
            added, removed = f.read()
            if added:
                log("added %d" % len(added))
            if removed:    
                log("removed %d" % len(removed))
            # Commits the folder state updated by read().
            # NOTE(review): this is a full commit, which looks at odds
            # with the comment below when an earlier folder in this loop
            # has already been learned -- confirm intent.
            get_transaction().commit()
            if not (added or removed):
                continue
            changed = True

            # It's important not to commit a transaction until
            # after update_probabilities is called in update().
            # Otherwise some new entries will cause scoring to fail.
            for msg in added.keys():
                self.classifier.learn(tokenize(msg), is_spam, False)
            del added
            # commit(1) is presumably a ZODB subtransaction commit, used
            # to bound memory while training -- TODO confirm.
            get_transaction().commit(1)
            log("learned")
            for msg in removed.keys():
                self.classifier.unlearn(tokenize(msg), is_spam, False)
            if removed: 
                log("unlearned")
            del removed
            get_transaction().commit(1)
        return changed