[Spambayes-checkins] spambayes/Outlook2000 addin.py, 1.153, 1.154 export.py, 1.19, 1.20 msgstore.py, 1.100, 1.101

Mon Feb 12 12:35:38 CET 2007

Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8117

Modified Files:
	addin.py export.py msgstore.py 
Log Message:
Integrate OCR with outlook plugin


Index: addin.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v
retrieving revision 1.153
retrieving revision 1.154
diff -C2 -d -r1.153 -r1.154
*** addin.py	10 Jun 2006 04:57:10 -0000	1.153
--- addin.py	12 Feb 2007 11:35:34 -0000	1.154
***************
*** 444,460 ****
                  TrainAsSpam(msgstore_message, self.manager)
  
! # Event function fired from the "Show Clues" UI items.
! def ShowClues(mgr, explorer):
      from cgi import escape
- 
-     app = explorer.Application
-     msgstore_message = explorer.GetSelectedMessages(False)
-     if msgstore_message is None:
-         return
      mgr.classifier_data.message_db.load_msg(msgstore_message)
- 
-     item = msgstore_message.GetOutlookItem()
      score, clues = mgr.score(msgstore_message, evidence=True)
!     new_msg = app.CreateItem(0)
      # NOTE: Silly Outlook always switches the message editor back to RTF
      # once the Body property has been set.  Thus, there is no reasonable
--- 444,452 ----
                  TrainAsSpam(msgstore_message, self.manager)
  
! def GetClues(mgr, msgstore_message):
      from cgi import escape
      mgr.classifier_data.message_db.load_msg(msgstore_message)
      score, clues = mgr.score(msgstore_message, evidence=True)
! 
      # NOTE: Silly Outlook always switches the message editor back to RTF
      # once the Body property has been set.  Thus, there is no reasonable
***************
*** 533,538 ****
      # Now the raw text of the message, as best we can
      push("<h2>Message Stream</h2>\n")
-     push("<PRE>\n")
      msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
      push(escape(msg.as_string(), True))
      push("</PRE>\n")
--- 525,530 ----
      # Now the raw text of the message, as best we can
      push("<h2>Message Stream</h2>\n")
      msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
+     push("<PRE>\n")
      push(escape(msg.as_string(), True))
      push("</PRE>\n")
***************
*** 562,565 ****
--- 554,570 ----
      # Put the body together, then the rest of the message.
      body = ''.join(body)
+     return body
+ 
+ # Event function fired from the "Show Clues" UI items.
+ def ShowClues(mgr, explorer):
+ 
+     app = explorer.Application
+     msgstore_message = explorer.GetSelectedMessages(False)
+     if msgstore_message is None:
+         return
+ 
+     body = GetClues(mgr, msgstore_message)
+     item = msgstore_message.GetOutlookItem()
+     new_msg = app.CreateItem(0)
      new_msg.Subject = "Spam Clues: " + item.Subject
      # As above, use HTMLBody else Outlook refuses to behave.

Index: export.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/export.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** export.py	29 Dec 2003 00:35:20 -0000	1.19
--- export.py	12 Feb 2007 11:35:34 -0000	1.20
***************
*** 41,46 ****
  # Return the text of msg (a MAPIMsgStoreMsg object) as a string.
  # There are subtleties, alas.
! def get_text(msg):
!     email_object = msg.GetEmailPackageObject()
      try:
          # Don't use str(msg) instead -- that inserts an information-
--- 41,49 ----
  # Return the text of msg (a MAPIMsgStoreMsg object) as a string.
  # There are subtleties, alas.
! def get_text(msg, old_style):
!     if old_style:
!         email_object = msg.OldGetEmailPackageObject()
!     else:
!         email_object = msg.GetEmailPackageObject()
      try:
          # Don't use str(msg) instead -- that inserts an information-
***************
*** 93,97 ****
  # Returns the total number of .txt files created (== the number of msgs
  # successfully exported).
! def _export_folders(manager, root, buckets, folder_ids, include_sub):
      from random import choice
  
--- 96,100 ----
  # Returns the total number of .txt files created (== the number of msgs
  # successfully exported).
! def _export_folders(manager, root, buckets, folder_ids, include_sub, old_style):
      from random import choice
  
***************
*** 104,108 ****
              # filename is the EID.txt
              try:
!                 msg_text = get_text(message)
              except KeyboardInterrupt:
                  raise
--- 107,111 ----
              # filename is the EID.txt
              try:
!                 msg_text = get_text(message, old_style)
              except KeyboardInterrupt:
                  raise
***************
*** 121,125 ****
  # This does all the work.  'directory' is the parent directory for the
  # generated Ham and Spam sub-folders.
! def export(directory, num_buckets):
      print "Loading bayes manager..."
      manager = GetManager()
--- 124,128 ----
  # This does all the work.  'directory' is the parent directory for the
  # generated Ham and Spam sub-folders.
! def export(directory, num_buckets, old_style):
      print "Loading bayes manager..."
      manager = GetManager()
***************
*** 141,145 ****
                            buckets,
                            config.training.spam_folder_ids,
!                           config.training.spam_include_sub)
      print "Exported", num, "spam messages."
  
--- 144,149 ----
                            buckets,
                            config.training.spam_folder_ids,
!                           config.training.spam_include_sub,
!                           old_style)
      print "Exported", num, "spam messages."
  
***************
*** 149,153 ****
                            buckets,
                            config.training.ham_folder_ids,
!                           config.training.ham_include_sub)
      print "Exported", num, "ham messages."
  
--- 153,158 ----
                            buckets,
                            config.training.ham_folder_ids,
!                           config.training.ham_include_sub,
!                           old_style)
      print "Exported", num, "ham messages."
  
***************
*** 156,163 ****
  
      try:
!         opts, args = getopt.getopt(sys.argv[1:], "hqn:")
      except getopt.error, d:
          usage(d)
      quiet = 0
      num_buckets = NUM_BUCKETS
      for opt, val in opts:
--- 161,169 ----
  
      try:
!         opts, args = getopt.getopt(sys.argv[1:], "hqon:")
      except getopt.error, d:
          usage(d)
      quiet = 0
+     old_style = False
      num_buckets = NUM_BUCKETS
      for opt, val in opts:
***************
*** 168,171 ****
--- 174,179 ----
          elif opt == '-n':
              num_buckets = int(val)
+         elif opt == '-o':
+             old_style = True
          else:
              assert 0, "internal error on option '%s'" % opt
***************
*** 191,195 ****
      if not quiet:
          raw_input("Press enter to continue, or Ctrl+C to abort.")
!     export(directory, num_buckets)
  
  # Display errormsg (if specified), a blank line, and usage information; then
--- 199,203 ----
      if not quiet:
          raw_input("Press enter to continue, or Ctrl+C to abort.")
!     export(directory, num_buckets, old_style=old_style)
  
  # Display errormsg (if specified), a blank line, and usage information; then

Index: msgstore.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v
retrieving revision 1.100
retrieving revision 1.101
diff -C2 -d -r1.100 -r1.101
*** msgstore.py	6 Apr 2005 03:06:51 -0000	1.100
--- msgstore.py	12 Feb 2007 11:35:34 -0000	1.101
***************
*** 3,6 ****
--- 3,20 ----
  import sys, os, re
  import locale
+ from time import timezone
+ 
+ import email
+ from email.MIMEImage import MIMEImage
+ from email.Message import Message
+ from email.MIMEMultipart import MIMEMultipart
+ from email.MIMEText import MIMEText
+ from email.Parser import HeaderParser
+ from email.Utils import formatdate
+ 
+ try:
+     from cStringIO import StringIO
+ except ImportError:
+     from StringIO import StringIO
  
  try:
***************
*** 723,727 ****
              folder = self.msgstore._OpenEntry(self.id)
              # Nuke my MAPI reference, and set my ID to None
!             rc = folder.DeleteMessages(real_ids, 0, None, 0)
          except pythoncom.com_error, details:
              raise MsgStoreExceptionFromCOMException(details)
--- 737,741 ----
              folder = self.msgstore._OpenEntry(self.id)
              # Nuke my MAPI reference, and set my ID to None
!             folder.DeleteMessages(real_ids, 0, None, 0)
          except pythoncom.com_error, details:
              raise MsgStoreExceptionFromCOMException(details)
***************
*** 884,888 ****
      def _GetMessageText(self):
          parts = self._GetMessageTextParts()
!         # parts is (headers, body, html), but could possibly grow
          return "\n".join(parts)
  
--- 898,904 ----
      def _GetMessageText(self):
          parts = self._GetMessageTextParts()
!         # parts is (headers, body, html) - which needs more formalizing -
!         # GetMessageText should become deprecated - it makes no sense in the
!         # face of multi-part messages.
          return "\n".join(parts)
  
***************
*** 893,896 ****
--- 909,915 ----
          # Note we *dont* look in plain text attachments, which we arguably
          # should.
+         # This should be refactored into a function that returns the headers,
+         # plus a list of email package sub-objects suitable for sending to
+         # the classifier.
          from spambayes import mboxutils
  
***************
*** 936,939 ****
--- 955,960 ----
              # Find all attachments with
              # PR_ATTACH_MIME_TAG_A=multipart/signed
+             # XXX - see also self._GetAttachmentsToInclude(), which
+             # scans the attachment table - we should consolidate!
              table = self.mapi_object.GetAttachmentTable(0)
              restriction = (mapi.RES_PROPERTY,   # a property restriction
***************
*** 973,977 ****
                  # it into a message object, so we can extract the text, so
                  # we can stick it back into another one.  Ahhhhh.
-                 import email
                  msg = email.message_from_string(attach_body)
                  assert msg.is_multipart(), "Should be multi-part: %r" % attach_body
--- 994,997 ----
***************
*** 1032,1037 ****
  
      def _format_time(self, raw):
-         from time import timezone
-         from email.Utils import formatdate
          return formatdate(int(raw)-timezone, True)
  
--- 1052,1055 ----
***************
*** 1068,1071 ****
--- 1086,1134 ----
                  raise MsgStoreExceptionFromCOMException(details)
  
+     def _GetAttachmentsToInclude(self):
+         # Get the list of attachments to include in the email package
+         # Message object. Currently only images (BUT - consider consolidating
+         # with the attachment handling above for signed messages!)
+         from spambayes.Options import options
+         from spambayes.ImageStripper import image_large_size_attribute
+ 
+         # For now, we know these are the only 2 options that need attachments.
+         if not options['Tokenizer', 'x-crack_images'] and \
+            not options['Tokenizer', 'x-image_size']:
+             return []
+         try:
+             table = self.mapi_object.GetAttachmentTable(0)
+             tags = PR_ATTACH_NUM,PR_ATTACH_MIME_TAG_A,PR_ATTACH_SIZE,PR_ATTACH_DATA_BIN
+             attach_rows = mapi.HrQueryAllRows(table, tags, None, None, 0)
+         except pythoncom.com_error, why:
+             attach_rows = []
+ 
+         attachments = []
+         # Create a new attachment for each image.
+         for row in attach_rows:
+             attach_num = row[0][1]
+             # mime-tag may not exist - eg, seen on bounce messages
+             mime_tag = None
+             if PROP_TYPE(row[1][0]) != PT_ERROR:
+                 mime_tag = row[1][1]
+             # oh - what is the library for this!?
+             if mime_tag:
+                 typ, subtyp = mime_tag.split('/', 1)
+                 if typ == 'image':
+                     size = row[2][1]
+                     # If it is too big, just write the size.  ImageStripper.py
+                     # checks this attribute.
+                     if size > options["Tokenizer", "max_image_size"]:
+                         sub = MIMEImage(None, subtyp)
+                         setattr(sub, image_large_size_attribute, size)
+                     else:
+                         attach = self.mapi_object.OpenAttach(attach_num,
+                                         None, mapi.MAPI_DEFERRED_ERRORS)
+                         data = GetPotentiallyLargeStringProp(attach,
+                                     PR_ATTACH_DATA_BIN, row[3])
+                         sub = MIMEImage(data, subtyp)
+                     attachments.append(sub)
+         return attachments
+ 
      def GetEmailPackageObject(self, strip_mime_headers=True):
          # Return an email.Message object.
***************
*** 1096,1099 ****
--- 1159,1288 ----
          # Short course:  we either have to synthesize non-insane MIME
          # structure, or eliminate all evidence of original MIME structure.
+         # We used to do the latter - but now that we must give valid
+         # multipart messages which include attached images, we are forced
+         # to try and do the former (but actually the 2 options are not
+         # mutually exclusive - first we eliminate all evidence of original
+         # MIME structure, before allowing the email package to synthesize
+         # non-insane MIME structure).
+ 
+         # We still jump through hoops though - if we have no interesting
+         # attachments we attempt to return as close as possible to what
+         # we always returned in the past - a "single-part" message with the
+         # text and HTML as a simple text body.
+         header_text, body, html = self._GetMessageTextParts()
+ 
+         try: # catch all exceptions!
+             # Try and decide early if we want multipart or not.
+             # We originally just looked at the content-type - but Outlook
+             # is unreliable WRT that header!  Also, consider a multipart message
+             # with only text and html sections and no additional attachments.
+             # Outlook will generally have copied the HTML and Text sections
+             # into the relevant properties and they will *not* appear as
+             # attachments. We should return the 'single' message here to keep
+             # as close as possible to what we used to return.  We can change
+             # this policy in the future - but we would probably need to insist
+             # on a full re-train as the training tokens will have changed for
+             # many messages.
+             attachments = self._GetAttachmentsToInclude()
+             new_content_type = None
+             if attachments:
+                 _class = MIMEMultipart
+                 payload = []
+                 if body:
+                     payload.append(MIMEText(body))
+                 if html:
+                     payload.append(MIMEText(html, 'html'))
+                 payload += attachments
+                 new_content_type = "multipart/mixed"
+             else:
+                 # Single message part with both text and HTML.
+                 _class = Message
+                 payload = body + '\n' + html
+ 
+             try:
+                 root_msg = HeaderParser(_class=_class).parsestr(header_text)
+             except email.Errors.HeaderParseError:
+                 raise # sob
+                 # ack - it is about here we need to do what the old code did
+                 # below:  But - the fact the code below is dealing only
+                 # with content-type (and the fact we handle that above) makes
+                 # it less obvious....
+ 
+                 ## But even this doesn't get *everything*.  We can still see:
+                 ##  "multipart message with no defined boundary" or the
+                 ## HeaderParseError above.  Time to get brutal - hack out
+                 ## the Content-Type header, so we see it as plain text.
+                 #if msg is None:
+                 #    butcher_pos = text.lower().find("\ncontent-type: ")
+                 #    if butcher_pos < 0:
+                 #        # This error is just going to get caught below anyway
+                 #        raise RuntimeError(
+                 #            "email package croaked with a MIME related error, but "
+                 #            "there appears to be no 'Content-Type' header")
+                 #    # Put it back together, skipping the original "\n" but
+                 #    # leaving the header leaving "\nSpamBayes-Content-Type: "
+                 #    butchered = text[:butcher_pos] + "\nSpamBayes-" + \
+                 #                text[butcher_pos+1:] + "\n\n"
+                 #    msg = email.message_from_string(butchered)
+     
+             # patch up mime stuff - these headers will confuse the email
+             # package as it walks the attachments.
+             if strip_mime_headers:
+                 for h, new_val in (('content-type', new_content_type),
+                                    ('content-transfer-encoding', None)):
+                     try:
+                         root_msg['X-SpamBayes-Original-' + h] = root_msg[h]
+                         del root_msg[h]
+                     except KeyError:
+                         pass
+                     if new_val is not None:
+                         root_msg[h] = new_val
+ 
+             root_msg.set_payload(payload)
+ 
+             # We used to call email.message_from_string(text) and catch:
+             # email.Errors.BoundaryError: should no longer happen - we no longer
+             # ask the email package to parse anything beyond headers.
+             # email.Errors.HeaderParseError: caught above
+         except:
+             text = '\r\n'.join([header_text, body, html])
+             print "FAILED to create email.message from: ", `text`
+             raise
+ 
+         return root_msg
+ 
+     # XXX - this is the OLD version of GetEmailPackageObject() - it
+     # temporarily remains as a testing aid, to ensure that the different
+     # mime structure we now generate has no negative effects.
+     # Use 'sandbox/export.py -o' to export to the testdata directory
+     # in the old format, then run the cross-validation tests.
+     def OldGetEmailPackageObject(self, strip_mime_headers=True):
+         # Return an email.Message object.
+         #
+         # strip_mime_headers is a hack, and should be left True unless you're
+         # trying to display all the headers for diagnostic purposes.  If we
+         # figure out something better to do, it should go away entirely.
+         #
+         # Problem #1:  suppose a msg is multipart/alternative, with
+         # text/plain and text/html sections.  The latter MIME decorations
+         # are plain missing in what _GetMessageText() returns.  If we leave
+         # the multipart/alternative in the headers anyway, the email
+         # package's "lax parsing" won't complain about not finding any
+         # sections, but since the type *is* multipart/alternative then
+         # anyway, the tokenizer finds no text/* parts at all to tokenize.
+         # As a result, only the headers get tokenized.  By stripping
+         # Content-Type from the headers (if present), the email pkg
+         # considers the body to be text/plain (the default), and so it
+         # does get tokenized.
+         #
+         # Problem #2:  Outlook decodes quoted-printable and base64 on its
+         # own, but leaves any Content-Transfer-Encoding line in the headers.
+         # This can cause the email pkg to try to decode the text again,
+         # with unpleasant (but rarely fatal) results.  If we strip that
+         # header too, no problem -- although the fact that a msg was
+         # encoded in base64 is usually a good spam clue, and we miss that.
+         #
+         # Short course:  we either have to synthesize non-insane MIME
+         # structure, or eliminate all evidence of original MIME structure.
          # Since we don't have a way to the former, by default this function
          # does the latter.
***************
*** 1146,1150 ****
  
          return msg
! 
      def SetField(self, prop, val):
          # Future optimization note - from GetIDsFromNames doco
--- 1335,1340 ----
  
          return msg
!     # end of OLD GetEmailPackageObject
!     
      def SetField(self, prop, val):
          # Future optimization note - from GetIDsFromNames doco
***************
*** 1341,1345 ****
  
  def test():
-     from win32com.client import Dispatch
      outlook = Dispatch("Outlook.Application")
      inbox = outlook.Session.GetDefaultFolder(constants.olFolderInbox)
--- 1531,1534 ----