[Spambayes-checkins] spambayes/Outlook2000 addin.py, 1.138, 1.139 manager.py, 1.98, 1.99 msgstore.py, 1.88, 1.89 tester.py, 1.23, 1.24 train.py, 1.39, 1.40

Tony Meyer anadelonbrin at users.sourceforge.net
Fri Nov 26 00:27:02 CET 2004


Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv23924/Outlook2000

Modified Files:
	addin.py manager.py msgstore.py tester.py train.py 
Log Message:
Stop using the deprecated access to the bayes database and use the manager.classifier_data
 directly.

Switch to using a spambayes.message.MessageInfo database rather than an Outlook specific
 one.  This allows us to store more data than just the 'trained' status that we currently
 store, and also reduces code duplication and simplifies the Outlook code a little
 bit.

I have tested this as much as possible, and run it for a couple of days here and it
 appears to work.  The old database should still be usable (both old style and new
 style data can be in the same database) and work, so it should be seamless.  The
 change is more-or-less the same as when the sb_server/sb_imapfilter database swapped
 to storing more than just 'c' and 't', and there weren't problems there, so fingers
 crossed...

Index: addin.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v
retrieving revision 1.138
retrieving revision 1.139
diff -C2 -d -r1.138 -r1.139
*** addin.py	17 Nov 2004 00:01:06 -0000	1.138
--- addin.py	25 Nov 2004 23:26:57 -0000	1.139
***************
*** 125,130 ****
      # If the message has been trained on, we certainly have seen it before.
      import train
!     if train.been_trained_as_ham(msgstore_message, manager.classifier_data) or \
!        train.been_trained_as_spam(msgstore_message, manager.classifier_data):
          return True
      # I considered checking if the "save spam score" option is enabled - but
--- 125,131 ----
      # If the message has been trained on, we certainly have seen it before.
      import train
!     manager.classifier_data.message_db.load_msg(msgstore_message)
!     if train.been_trained_as_ham(msgstore_message) or \
!        train.been_trained_as_spam(msgstore_message):
          return True
      # I considered checking if the "save spam score" option is enabled - but
***************
*** 149,155 ****
      else:
          print "already was trained as good"
!     assert train.been_trained_as_ham(msgstore_message, manager.classifier_data)
      if save_db:
!         manager.SaveBayesPostIncrementalTrain()
  
  def TrainAsSpam(msgstore_message, manager, rescore = True, save_db = True):
--- 150,157 ----
      else:
          print "already was trained as good"
!     manager.classifier_data.message_db.load_msg(msgstore_message)
!     assert train.been_trained_as_ham(msgstore_message)
      if save_db:
!         manager.classifier_data.SavePostIncrementalTrain()
  
  def TrainAsSpam(msgstore_message, manager, rescore = True, save_db = True):
***************
*** 167,174 ****
      else:
          print "already was trained as spam"
!     assert train.been_trained_as_spam(msgstore_message, manager.classifier_data)
      # And if the DB can save itself incrementally, do it now
      if save_db:
!         manager.SaveBayesPostIncrementalTrain()
  
  # Function to filter a message - note it is a msgstore msg, not an
--- 169,177 ----
      else:
          print "already was trained as spam"
!     manager.classifier_data.message_db.load_msg(msgstore_message)
!     assert train.been_trained_as_spam(msgstore_message)
      # And if the DB can save itself incrementally, do it now
      if save_db:
!         manager.classifier_data.SavePostIncrementalTrain()
  
  # Function to filter a message - note it is a msgstore msg, not an
***************
*** 190,194 ****
              if manager.config.training.train_recovered_spam:
                  import train
!                 if train.been_trained_as_spam(msgstore_message, manager.classifier_data):
                      need_train = True
                  else:
--- 193,198 ----
              if manager.config.training.train_recovered_spam:
                  import train
!                 manager.classifier_data.message_db.load_msg(msgstore_message)
!                 if train.been_trained_as_spam(msgstore_message):
                      need_train = True
                  else:
***************
*** 200,204 ****
                      # 'Unsure', then this event is unlikely to be the user
                      # re-classifying (and in fact it may simply be the Outlook
!                     # rules moving the item.
                      need_train = manager.config.filter.unsure_threshold < prop * 100
  
--- 204,208 ----
                      # 'Unsure', then this event is unlikely to be the user
                      # re-classifying (and in fact it may simply be the Outlook
!                     # rules moving the item).
                      need_train = manager.config.filter.unsure_threshold < prop * 100
  
***************
*** 422,426 ****
              # previously trained, try and optimize.
              import train
!             if train.been_trained_as_ham(msgstore_message, self.manager.classifier_data):
                  need_train = True
              else:
--- 426,431 ----
              # previously trained, try and optimize.
              import train
!             self.manager.classifier_data.message_db.load_msg(msgstore_message)
!             if train.been_trained_as_ham(msgstore_message):
                  need_train = True
              else:
***************
*** 441,444 ****
--- 446,450 ----
      if msgstore_message is None:
          return
+     mgr.classifier_data.message_db.load_msg(msgstore_message)
  
      item = msgstore_message.GetOutlookItem()
***************
*** 479,486 ****
      # Report whether this message has been trained or not.
      push("<br>\n")
-     trained_as = mgr.classifier_data.message_db.get(msgstore_message.searchkey)
      push("This message has %sbeen trained%s." % \
!          {'0' : ("", " as ham"), '1' : ("", " as spam"), None : ("not ", "")}
!          [trained_as])
      # Format the clues.
      push("<h2>%s Significant Tokens</h2>\n<PRE>" % len(clues))
--- 485,491 ----
      # Report whether this message has been trained or not.
      push("<br>\n")
      push("This message has %sbeen trained%s." % \
!          {False : ("", " as ham"), True : ("", " as spam"),
!           None : ("not ", "")}[msgstore_message.t])
      # Format the clues.
      push("<h2>%s Significant Tokens</h2>\n<PRE>" % len(clues))
***************
*** 707,711 ****
              # but we are smart enough to know we have already done it.
          # And if the DB can save itself incrementally, do it now
!         self.manager.SaveBayesPostIncrementalTrain()
          SetWaitCursor(0)
  
--- 712,716 ----
              # but we are smart enough to know we have already done it.
          # And if the DB can save itself incrementally, do it now
!         self.manager.classifier_data.SavePostIncrementalTrain()
          SetWaitCursor(0)
  
***************
*** 774,778 ****
              # but we are smart enough to know we have already done it.
          # And if the DB can save itself incrementally, do it now
!         self.manager.SaveBayesPostIncrementalTrain()
          SetWaitCursor(0)
  
--- 779,783 ----
              # but we are smart enough to know we have already done it.
          # And if the DB can save itself incrementally, do it now
!         self.manager.classifier_data.SavePostIncrementalTrain()
          SetWaitCursor(0)
  

Index: manager.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v
retrieving revision 1.98
retrieving revision 1.99
diff -C2 -d -r1.98 -r1.99
*** manager.py	2 Nov 2004 21:33:46 -0000	1.98
--- manager.py	25 Nov 2004 23:26:58 -0000	1.99
***************
*** 118,122 ****
  
  def import_core_spambayes_stuff(ini_filenames):
!     global bayes_classifier, bayes_tokenize, bayes_storage, bayes_options
      if "spambayes.Options" in sys.modules:
          # The only thing we are worried about here is spambayes.Options
--- 118,123 ----
  
  def import_core_spambayes_stuff(ini_filenames):
!     global bayes_classifier, bayes_tokenize, bayes_storage, bayes_options, \
!            bayes_message
      if "spambayes.Options" in sys.modules:
          # The only thing we are worried about here is spambayes.Options
***************
*** 144,150 ****
--- 145,153 ----
      from spambayes.tokenizer import tokenize
      from spambayes import storage
+     from spambayes import message
      bayes_classifier = classifier
      bayes_tokenize = tokenize
      bayes_storage = storage
+     bayes_message = message
      assert "spambayes.Options" in sys.modules, \
          "Expected 'spambayes.Options' to be loaded here"
***************
*** 170,174 ****
  # Base class for our "storage manager" - we choose between the pickle
  # and DB versions at runtime.  As our bayes uses spambayes.storage,
! # our base class can share common bayes loading code.
  class BasicStorageManager:
      db_extension = None # for pychecker - overwritten by subclass
--- 173,179 ----
  # Base class for our "storage manager" - we choose between the pickle
  # and DB versions at runtime.  As our bayes uses spambayes.storage,
! # our base class can share common bayes loading code, and we use
! # spambayes.message, so the base class can share common message info
! # code, too.
  class BasicStorageManager:
      db_extension = None # for pychecker - overwritten by subclass
***************
*** 186,205 ****
          bayes.store()
      def open_bayes(self):
!         raise NotImplementedError
      def close_bayes(self, bayes):
          bayes.close()
  
  class PickleStorageManager(BasicStorageManager):
      db_extension = ".pck"
!     def open_bayes(self):
!         return bayes_storage.PickledClassifier(self.bayes_filename)
!     def open_mdb(self):
!         return cPickle.load(open(self.mdb_filename, 'rb'))
      def new_mdb(self):
          return {}
-     def store_mdb(self, mdb):
-         SavePickle(mdb, self.mdb_filename)
-     def close_mdb(self, mdb):
-         pass
      def is_incremental(self):
          return False # False means we always save the entire DB
--- 191,209 ----
          bayes.store()
      def open_bayes(self):
!         return bayes_storage.open_storage(self.bayes_filename, self.klass)
      def close_bayes(self, bayes):
          bayes.close()
+     def open_mdb(self):
+         return bayes_message.open_storage(self.mdb_filename, self.klass)
+     def store_mdb(self, mdb):
+         mdb.store()
+     def close_mdb(self, mdb):
+         mdb.close()
  
  class PickleStorageManager(BasicStorageManager):
      db_extension = ".pck"
!     klass = "pickle"
      def new_mdb(self):
          return {}
      def is_incremental(self):
          return False # False means we always save the entire DB
***************
*** 207,217 ****
  class DBStorageManager(BasicStorageManager):
      db_extension = ".db"
!     def open_bayes(self):
!         # bsddb doesn't handle unicode filenames yet :(
!         fname = self.bayes_filename.encode(filesystem_encoding)
!         return bayes_storage.DBDictClassifier(fname)
!     def open_mdb(self):
!         fname = self.mdb_filename.encode(filesystem_encoding)
!         return bsddb.hashopen(fname)
      def new_mdb(self):
          try:
--- 211,220 ----
  class DBStorageManager(BasicStorageManager):
      db_extension = ".db"
!     klass = "dbm"
!     def __init__(self, bayes_base_name, mdb_base_name):
!         self.bayes_filename = bayes_base_name.encode(filesystem_encoding) + \
!                               self.db_extension
!         self.mdb_filename = mdb_base_name.encode(filesystem_encoding) + \
!                             self.db_extension
      def new_mdb(self):
          try:
***************
*** 220,227 ****
              if e.errno != errno.ENOENT: raise
          return self.open_mdb()
-     def store_mdb(self, mdb):
-         mdb.sync()
-     def close_mdb(self, mdb):
-         mdb.close()
      def is_incremental(self):
          return True # True means only changed records get actually written
--- 223,226 ----
***************
*** 424,432 ****
          db_manager = ManagerClass(bayes_base, mdb_base)
          self.classifier_data = ClassifierData(db_manager, self)
-         self.LoadBayes()
-         self.stats = oastats.Stats(self.config, self.data_directory)
- 
-     # "old" bayes functions - new code should use "classifier_data" directly
-     def LoadBayes(self):
          try:
              self.classifier_data.Load()
--- 423,426 ----
***************
*** 434,445 ****
              self.ReportFatalStartupError("Failed to load bayes database")
              self.classifier_data.InitNew()
  
!     def InitNewBayes(self):
!         self.classifier_data.InitNew()
!     def SaveBayes(self):
!         self.classifier_data.Save()
!     def SaveBayesPostIncrementalTrain(self):
!         self.classifier_data.SavePostIncrementalTrain()
!     # Logging - this too should be somewhere else.
      def LogDebug(self, level, *args):
          if self.verbose >= level:
--- 428,434 ----
              self.ReportFatalStartupError("Failed to load bayes database")
              self.classifier_data.InitNew()
+         self.stats = oastats.Stats(self.config, self.data_directory)
  
!     # Logging - this should be somewhere else.
      def LogDebug(self, level, *args):
          if self.verbose >= level:

Index: msgstore.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v
retrieving revision 1.88
retrieving revision 1.89
diff -C2 -d -r1.88 -r1.89
*** msgstore.py	2 Nov 2004 21:34:56 -0000	1.88
--- msgstore.py	25 Nov 2004 23:26:58 -0000	1.89
***************
*** 807,810 ****
--- 807,817 ----
          self.dirty = False
  
+         # For use with the spambayes.message messageinfo database.
+         self.stored_attributes = ['t',]
+ 
+     def getDBKey(self):
+         # Long lived search key.
+         return self.searchkey
+ 
      def __repr__(self):
          if self.id is None:

Index: tester.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/tester.py,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** tester.py	24 Dec 2003 04:08:38 -0000	1.23
--- tester.py	25 Nov 2004 23:26:58 -0000	1.24
***************
*** 258,265 ****
          # Now move the message back to the inbox - it should get trained.
          store_msg = driver.manager.message_store.GetMessage(spam_msg)
          import train
!         if train.been_trained_as_ham(store_msg, driver.manager.classifier_data):
              TestFailed("This new spam message should not have been trained as ham yet")
!         if train.been_trained_as_spam(store_msg, driver.manager.classifier_data):
              TestFailed("This new spam message should not have been trained as spam yet")
          spam_msg.Move(folder_watch)
--- 258,266 ----
          # Now move the message back to the inbox - it should get trained.
          store_msg = driver.manager.message_store.GetMessage(spam_msg)
+         driver.manager.classifier_data.message_db.load_msg(store_msg)
          import train
!         if train.been_trained_as_ham(store_msg):
              TestFailed("This new spam message should not have been trained as ham yet")
!         if train.been_trained_as_spam(store_msg):
              TestFailed("This new spam message should not have been trained as spam yet")
          spam_msg.Move(folder_watch)
***************
*** 269,272 ****
--- 270,274 ----
              TestFailed("The message appears to have been filtered out of the watch folder")
          store_msg = driver.manager.message_store.GetMessage(spam_msg)
+         driver.manager.classifier_data.message_db.load_msg(store_msg)
          need_untrain = True
          try:
***************
*** 275,281 ****
              if nham+1 != bayes.nham:
                  TestFailed("There was not one more ham messages after a re-train")
!             if train.been_trained_as_spam(store_msg, driver.manager.classifier_data):
                  TestFailed("This new spam message should not have been trained as spam yet")
!             if not train.been_trained_as_ham(store_msg, driver.manager.classifier_data):
                  TestFailed("This new spam message should have been trained as ham now")
              # word infos should have one extra ham
--- 277,283 ----
              if nham+1 != bayes.nham:
                  TestFailed("There was not one more ham messages after a re-train")
!             if train.been_trained_as_spam(store_msg):
                  TestFailed("This new spam message should not have been trained as spam yet")
!             if not train.been_trained_as_ham(store_msg):
                  TestFailed("This new spam message should have been trained as ham now")
              # word infos should have one extra ham
***************
*** 289,299 ****
                  TestFailed("Could not find the message in the Spam folder")
              store_msg = driver.manager.message_store.GetMessage(spam_msg)
              if nspam +1 != bayes.nspam:
                  TestFailed("There should be one more spam now")
              if nham != bayes.nham:
                  TestFailed("There should be the same number of hams again")
!             if not train.been_trained_as_spam(store_msg, driver.manager.classifier_data):
                  TestFailed("This new spam message should have been trained as spam by now")
!             if train.been_trained_as_ham(store_msg, driver.manager.classifier_data):
                  TestFailed("This new spam message should have been un-trained as ham")
              # word infos should have one extra spam, no extra ham
--- 291,302 ----
                  TestFailed("Could not find the message in the Spam folder")
              store_msg = driver.manager.message_store.GetMessage(spam_msg)
+             driver.manager.classifier_data.message_db.load_msg(store_msg)
              if nspam +1 != bayes.nspam:
                  TestFailed("There should be one more spam now")
              if nham != bayes.nham:
                  TestFailed("There should be the same number of hams again")
!             if not train.been_trained_as_spam(store_msg):
                  TestFailed("This new spam message should have been trained as spam by now")
!             if train.been_trained_as_ham(store_msg):
                  TestFailed("This new spam message should have been un-trained as ham")
              # word infos should have one extra spam, no extra ham
***************
*** 308,312 ****
                  TestFailed("Could not find the message in the Unsure folder")
              store_msg = driver.manager.message_store.GetMessage(spam_msg)
!             if not train.been_trained_as_spam(store_msg, driver.manager.classifier_data):
                  TestFailed("Message was not identified as Spam after moving")
  
--- 311,316 ----
                  TestFailed("Could not find the message in the Unsure folder")
              store_msg = driver.manager.message_store.GetMessage(spam_msg)
!             driver.manager.classifier_data.message_db.load_msg(store_msg)
!             if not train.been_trained_as_spam(store_msg):
                  TestFailed("Message was not identified as Spam after moving")
  
***************
*** 316,323 ****
              # Now undo the damage we did.
              was_spam = train.untrain_message(store_msg, driver.manager.classifier_data)
              if not was_spam:
                  TestFailed("Untraining this message did not indicate it was spam")
!             if train.been_trained_as_spam(store_msg, driver.manager.classifier_data) or \
!                train.been_trained_as_ham(store_msg, driver.manager.classifier_data):
                  TestFailed("Untraining this message kept it has ham/spam")
              need_untrain = False
--- 320,328 ----
              # Now undo the damage we did.
              was_spam = train.untrain_message(store_msg, driver.manager.classifier_data)
+             driver.manager.classifier_data.message_db.load_msg(store_msg)
              if not was_spam:
                  TestFailed("Untraining this message did not indicate it was spam")
!             if train.been_trained_as_spam(store_msg) or \
!                train.been_trained_as_ham(store_msg):
                  TestFailed("Untraining this message kept it has ham/spam")
              need_untrain = False

Index: train.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** train.py	2 Nov 2004 21:36:54 -0000	1.39
--- train.py	25 Nov 2004 23:26:58 -0000	1.40
***************
*** 5,8 ****
--- 5,9 ----
  # Copyright PSF, license under the PSF license
  
+ import sys
  import traceback
  from win32com.mapi import mapi
***************
*** 17,29 ****
  # Note our Message Database uses PR_SEARCH_KEY, *not* PR_ENTRYID, as the
  # latter changes after a Move operation - see msgstore.py
! def been_trained_as_ham(msg, cdata):
!     if not cdata.message_db.has_key(msg.searchkey):
          return False
!     return cdata.message_db[msg.searchkey]=='0'
  
! def been_trained_as_spam(msg, cdata):
!     if not cdata.message_db.has_key(msg.searchkey):
          return False
!     return cdata.message_db[msg.searchkey]=='1'
  
  def train_message(msg, is_spam, cdata):
--- 18,30 ----
  # Note our Message Database uses PR_SEARCH_KEY, *not* PR_ENTRYID, as the
  # latter changes after a Move operation - see msgstore.py
! def been_trained_as_ham(msg):
!     if msg.t is None:
          return False
!     return msg.t == False
  
! def been_trained_as_spam(msg):
!     if msg.t is None:
          return False
!     return msg.t == True
  
  def train_message(msg, is_spam, cdata):
***************
*** 36,43 ****
      from spambayes.tokenizer import tokenize
  
!     if not cdata.message_db.has_key(msg.searchkey):
!         was_spam = None
!     else:
!         was_spam = cdata.message_db[msg.searchkey]=='1'
      if was_spam == is_spam:
          return False    # already correctly classified
--- 37,42 ----
      from spambayes.tokenizer import tokenize
  
!     cdata.message_db.load_msg(msg)
!     was_spam = msg.t
      if was_spam == is_spam:
          return False    # already correctly classified
***************
*** 51,55 ****
      # Learn the correct classification.
      cdata.bayes.learn(tokenize(stream), is_spam)
!     cdata.message_db[msg.searchkey] = ['0', '1'][is_spam]
      cdata.dirty = True
      return True
--- 50,55 ----
      # Learn the correct classification.
      cdata.bayes.learn(tokenize(stream), is_spam)
!     msg.t = is_spam
!     cdata.message_db.store_msg(msg)
      cdata.dirty = True
      return True
***************
*** 62,75 ****
      from spambayes.tokenizer import tokenize
      stream = msg.GetEmailPackageObject()
!     if been_trained_as_spam(msg, cdata):
!         assert not been_trained_as_ham(msg, cdata), "Can't have been both!"
          cdata.bayes.unlearn(tokenize(stream), True)
!         del cdata.message_db[msg.searchkey]
          cdata.dirty = True
          return True
!     if been_trained_as_ham(msg, cdata):
!         assert not been_trained_as_spam(msg, cdata), "Can't have been both!"
          cdata.bayes.unlearn(tokenize(stream), False)
!         del cdata.message_db[msg.searchkey]
          cdata.dirty = True
          return False
--- 62,76 ----
      from spambayes.tokenizer import tokenize
      stream = msg.GetEmailPackageObject()
!     cdata.message_db.load_msg(msg)
!     if been_trained_as_spam(msg):
!         assert not been_trained_as_ham(msg), "Can't have been both!"
          cdata.bayes.unlearn(tokenize(stream), True)
!         cdata.message_db.remove_msg(msg)
          cdata.dirty = True
          return True
!     if been_trained_as_ham(msg):
!         assert not been_trained_as_spam(msg), "Can't have been both!"
          cdata.bayes.unlearn(tokenize(stream), False)
!         cdata.message_db.remove_msg(msg)
          cdata.dirty = True
          return False



More information about the Spambayes-checkins mailing list