[Spambayes-checkins] spambayes imapfilter.py,1.36,1.37

Tony Meyer anadelonbrin at users.sourceforge.net
Mon Apr 28 01:01:29 EDT 2003


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv11246

Modified Files:
	imapfilter.py 
Log Message:
Switch imapfilter from using IMAP uids as persistent ids for
messages to using our own generated ids.  This code runs on one
server I have, but I don't have time for more testing at the moment.
I'm checking it in anyway, since the old code is Bad and this
should be ok.

WARNING: You will need to discard any hammie.db that you
have trained with imapfilter.py, as well as any
spambayes.messageinfo.db files.

Index: imapfilter.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/imapfilter.py,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** imapfilter.py	28 Apr 2003 05:20:46 -0000	1.36
--- imapfilter.py	28 Apr 2003 07:01:20 -0000	1.37
***************
*** 93,96 ****
--- 93,101 ----
        This might help if the username/password has special characters like
        accented characters.
+     o The code currently FETCHes the whole RFC822 message in training, even
+       if it will subsequently decide that the message has already been
+       trained (it won't then train on it, it just throws the message away).
+       This is really inefficient, and a better solution needs to be
+       developed.
      o Suggestions?
  """
***************
*** 191,196 ****
      def __init__(self, server, port, debug=0, do_expunge=False):
          imaplib.Debug = debug  # this is a global in the imaplib module
!         imaplib.IMAP4.__init__(self, server, port)
!         # XXX We should check here to see if the server/port were valid
          # For efficiency, we remember which folder we are currently
          # in, and only send a select command to the IMAP server if
--- 196,208 ----
      def __init__(self, server, port, debug=0, do_expunge=False):
          imaplib.Debug = debug  # this is a global in the imaplib module
!         try:
!             imaplib.IMAP4.__init__(self, server, port)
!         except:
!             # A more specific except would be good here, but I get
!             # (in Python 2.2) a generic 'error' and a 'gaierror'
!             # if I pass a valid domain that isn't an IMAP server
!             # or invalid domain (respectively)
!             print "Invalid server or port, please check these settings."
!             sys.exit(-1)
          # For efficiency, we remember which folder we are currently
          # in, and only send a select command to the IMAP server if
***************
*** 200,206 ****
          self.do_expunge = do_expunge
  
!     def login(self, uid, pw):
          try:
!             imaplib.IMAP4.login(self, uid, pw)  # superclass login
          except imaplib.IMAP4.error, e:
              if str(e) == "permission denied":
--- 212,218 ----
          self.do_expunge = do_expunge
  
!     def login(self, username, pwd):
          try:
!             imaplib.IMAP4.login(self, username, pwd)  # superclass login
          except imaplib.IMAP4.error, e:
              if str(e) == "permission denied":
***************
*** 219,224 ****
      def SelectFolder(self, folder):
          '''A method to point ensuing imap operations at a target folder'''
!         if self.current_folder != None:
!             if self.current_folder != folder:
                  if self.do_expunge:
                      # It is faster to do close() than a single
--- 231,236 ----
      def SelectFolder(self, folder):
          '''A method to point ensuing imap operations at a target folder'''
!         if self.current_folder != folder:
!             if self.current_folder != None:
                  if self.do_expunge:
                      # It is faster to do close() than a single
***************
*** 252,257 ****
  
      def _force_CRLF(self, data):
!         """Make sure data uses CRLF for line termination.
!         """
          return CRLF_RE.sub('\r\n', data)
  
--- 264,268 ----
  
      def _force_CRLF(self, data):
!         """Make sure data uses CRLF for line termination."""
          return CRLF_RE.sub('\r\n', data)
  
***************
*** 266,271 ****
      def extractTime(self):
          # When we create a new copy of a message, we need to specify
!         # a timestamp for the message.  If the message has a date header
!         # we use that.  Otherwise, we use the current time.
          message_date = self["Date"]
          if message_date is not None:
--- 277,282 ----
      def extractTime(self):
          # When we create a new copy of a message, we need to specify
!         # a timestamp for the message.  If the message has a valid date
!         # header we use that.  Otherwise, we use the current time.
          message_date = self["Date"]
          if message_date is not None:
***************
*** 278,288 ****
      def MoveTo(self, dest):
          '''Note that message should move to another folder.  No move is
!         carried out until Save() is called.'''
!         # This move operation just changes where we think we are,
!         # and we do an actual move on save (to avoid doing
!         # this more than once)
          if self.previous_folder is None:
              self.previous_folder = self.folder
!             self.folder = dest
  
      def Save(self):
--- 289,296 ----
      def MoveTo(self, dest):
          '''Note that message should move to another folder.  No move is
!         carried out until Save() is called, for efficiency.'''
          if self.previous_folder is None:
              self.previous_folder = self.folder
!         self.folder = dest
  
      def Save(self):
***************
*** 290,294 ****
          # we can't actually update the message with IMAP
          # so what we do is create a new message and delete the old one
-         # we need to copy the flags as well
          if self.folder is None:
              raise RuntimeError, """Can't save a message that doesn't
--- 298,301 ----
***************
*** 297,301 ****
              raise RuntimeError, """Can't save a message that doesn't have
              an id."""
!         response = imap.uid("FETCH", self.id, "(FLAGS INTERNALDATE)")
          self._check(response, 'fetch (flags internaldate)')
          data = _extract_fetch_data(response[1][0])
--- 304,308 ----
              raise RuntimeError, """Can't save a message that doesn't have
              an id."""
!         response = imap.uid("FETCH", self.uid, "(FLAGS INTERNALDATE)")
          self._check(response, 'fetch (flags internaldate)')
          data = _extract_fetch_data(response[1][0])
***************
*** 312,318 ****
              flags = None
  
!         # See searching for new uid comments below
!         old_id = self.id
!         self["X-Spambayes-IMAP-OldID"] = old_id
  
          response = imap.append(self.folder.name, flags,
--- 319,328 ----
              flags = None
  
!         # Once, we used the IMAP uid to keep track of messages.
!         # This fails miserably because it's only guarenteed to be unique
!         # within a particular folder.  Folders have a UID validity value,
!         # but this can change from session to session.  So we forget this
!         # imap rubbish and use our own id.
!         self[options["pop3proxy", "mailid_header_name"]] = self.id
  
          response = imap.append(self.folder.name, flags,
***************
*** 325,344 ****
              imap.SelectFolder(self.previous_folder.name)
              self.previous_folder = None
!         response = imap.uid("STORE", old_id, "+FLAGS.SILENT", "(\\Deleted)")
          self._check(response, 'store')
  
!         # We need to update the uid, as it will have changed
!         # Searching for the new message is full of problems.  Searching for
!         # the text sends far too much data through the connection, and
!         # doesn't work reliably anyway.  We instead search for a special
!         # header that we add for this explicit purpose.
          imap.SelectFolder(self.folder.name)
!         response = imap.uid("SEARCH", "HEADER", "X-Spambayes-IMAP-OldID",
!                             old_id)
          self._check(response, 'search')
          new_id = response[1][0]
!         # This works with NetMail, but not with Courier.  Very strange,
!         # and needs more examination.  For the moment, if the search
!         # turns up empty, we make the very big assumption that the new
          # message is the last one with a recent flag
          if new_id == "":
--- 335,352 ----
              imap.SelectFolder(self.previous_folder.name)
              self.previous_folder = None
!         response = imap.uid("STORE", self.uid, "+FLAGS.SILENT", "(\\Deleted)")
          self._check(response, 'store')
  
!         # We need to update the uid, as it will have changed.
!         # Although we don't use the UID to keep track of messages, we do
!         # have to use it for IMAP operations.
          imap.SelectFolder(self.folder.name)
!         response = imap.uid("SEARCH", "HEADER",
!                             options["pop3proxy", "mailid_header_name"],
!                             self.id)
          self._check(response, 'search')
          new_id = response[1][0]
!         # Let's hope it doesn't, but, just in case, if the search
!         # turns up empty, we make the assumption that the new
          # message is the last one with a recent flag
          if new_id == "":
***************
*** 348,358 ****
                  ids = new_id.split(' ')
                  new_id = ids[-1]
! 
!         #XXX This code to delete the old message id from the message
!         #XXX info db and manipulate the message id, is a *serious* hack.
!         #XXX There's gotta be a better way to do this.
!         message.msginfoDB._delState(self)
!         self.id = new_id
!         self.modified()
  
  # This performs a similar function to email.message_from_string()
--- 356,360 ----
                  ids = new_id.split(' ')
                  new_id = ids[-1]
!         self.uid = new_id
  
  # This performs a similar function to email.message_from_string()
***************
*** 362,368 ****
  
  class IMAPFolder(object):
!     def __init__(self, folder_name, readOnly=True):
          self.name = folder_name
          self.rfc822_command = "RFC822.PEEK"
  
      def __cmp__(self, obj):
--- 364,373 ----
  
  class IMAPFolder(object):
!     def __init__(self, folder_name):
          self.name = folder_name
          self.rfc822_command = "RFC822.PEEK"
+         # Unique names for cached messages - see _generate_id below.
+         self.lastBaseMessageName = ''
+         self.uniquifier = 2
  
      def __cmp__(self, obj):
***************
*** 385,389 ****
                  pass
  
!     def recent_keys(self):
          '''Returns uids for all the messages in the folder that
          are flagged as recent, but not flagged as deleted.'''
--- 390,394 ----
                  pass
  
!     def recent_uids(self):
          '''Returns uids for all the messages in the folder that
          are flagged as recent, but not flagged as deleted.'''
***************
*** 394,400 ****
  
      def keys(self):
!         '''Returns uids for all the messages in the folder not
          marked as deleted.'''
-         # request message range
          imap.SelectFolder(self.name)
          response = imap.uid("SEARCH", "UNDELETED")
--- 399,404 ----
  
      def keys(self):
!         '''Returns *uids* for all the messages in the folder not
          marked as deleted.'''
          imap.SelectFolder(self.name)
          response = imap.uid("SEARCH", "UNDELETED")
***************
*** 421,429 ****
          # raw rfc822 message
          msg = imapmessage_from_string(messageText)
-         msg.setId(key)
          msg.setFolder(self)
          
          return msg
!    
      def Train(self, classifier, isSpam):
          '''Train folder as spam/ham'''
--- 425,453 ----
          # raw rfc822 message
          msg = imapmessage_from_string(messageText)
          msg.setFolder(self)
+         msg.uid = data["UID"]
+         if msg.setIdFromPayload() is None:
+             msg.setId(self._generate_id())
+             # Unfortunately, we now have to re-save this message, so that
+             # our id is stored on the IMAP server.  Before anyone suggests
+             # it, we can't store it as a flag, because user-defined flags
+             # aren't supported by all IMAP servers.
+             msg.Save()
          
          return msg
! 
!     # Lifted straight from pop3proxy.py (under the name getNewMessageName)
!     def _generate_id(self):
!         # The message id is the time it arrived, with a uniquifier
!         # appended if two arrive within one clock tick of each other.
!         messageName = "%10.10d" % long(time.time())
!         if messageName == self.lastBaseMessageName:
!             messageName = "%s-%d" % (messageName, self.uniquifier)
!             self.uniquifier += 1
!         else:
!             self.lastBaseMessageName = messageName
!             self.uniquifier = 2
!         return messageName
! 
      def Train(self, classifier, isSpam):
          '''Train folder as spam/ham'''
***************
*** 522,526 ****
              # Select the folder to make sure it exists
              imap.SelectFolder(filter_folder)
!             folder = IMAPFolder(filter_folder, False)
              folder.Filter(self.classifier, self.spam_folder,
                            self.unsure_folder)
--- 546,550 ----
              # Select the folder to make sure it exists
              imap.SelectFolder(filter_folder)
!             folder = IMAPFolder(filter_folder)
              folder.Filter(self.classifier, self.spam_folder,
                            self.unsure_folder)





More information about the Spambayes-checkins mailing list