[Spambayes-checkins] spambayes/Outlook2000 addin.py, 1.153, 1.154 export.py, 1.19, 1.20 msgstore.py, 1.100, 1.101
Mark Hammond
mhammond at users.sourceforge.net
Mon Feb 12 12:35:38 CET 2007
Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8117
Modified Files:
addin.py export.py msgstore.py
Log Message:
Integrate OCR with outlook plugin
Index: addin.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/addin.py,v
retrieving revision 1.153
retrieving revision 1.154
diff -C2 -d -r1.153 -r1.154
*** addin.py 10 Jun 2006 04:57:10 -0000 1.153
--- addin.py 12 Feb 2007 11:35:34 -0000 1.154
***************
*** 444,460 ****
TrainAsSpam(msgstore_message, self.manager)
! # Event function fired from the "Show Clues" UI items.
! def ShowClues(mgr, explorer):
from cgi import escape
-
- app = explorer.Application
- msgstore_message = explorer.GetSelectedMessages(False)
- if msgstore_message is None:
- return
mgr.classifier_data.message_db.load_msg(msgstore_message)
-
- item = msgstore_message.GetOutlookItem()
score, clues = mgr.score(msgstore_message, evidence=True)
! new_msg = app.CreateItem(0)
# NOTE: Silly Outlook always switches the message editor back to RTF
# once the Body property has been set. Thus, there is no reasonable
--- 444,452 ----
TrainAsSpam(msgstore_message, self.manager)
! def GetClues(mgr, msgstore_message):
from cgi import escape
mgr.classifier_data.message_db.load_msg(msgstore_message)
score, clues = mgr.score(msgstore_message, evidence=True)
!
# NOTE: Silly Outlook always switches the message editor back to RTF
# once the Body property has been set. Thus, there is no reasonable
***************
*** 533,538 ****
# Now the raw text of the message, as best we can
push("<h2>Message Stream</h2>\n")
- push("<PRE>\n")
msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
push(escape(msg.as_string(), True))
push("</PRE>\n")
--- 525,530 ----
# Now the raw text of the message, as best we can
push("<h2>Message Stream</h2>\n")
msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
+ push("<PRE>\n")
push(escape(msg.as_string(), True))
push("</PRE>\n")
***************
*** 562,565 ****
--- 554,570 ----
# Put the body together, then the rest of the message.
body = ''.join(body)
+ return body
+
+ # Event function fired from the "Show Clues" UI items.
+ def ShowClues(mgr, explorer):
+
+ app = explorer.Application
+ msgstore_message = explorer.GetSelectedMessages(False)
+ if msgstore_message is None:
+ return
+
+ body = GetClues(mgr, msgstore_message)
+ item = msgstore_message.GetOutlookItem()
+ new_msg = app.CreateItem(0)
new_msg.Subject = "Spam Clues: " + item.Subject
# As above, use HTMLBody else Outlook refuses to behave.
Index: export.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/export.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** export.py 29 Dec 2003 00:35:20 -0000 1.19
--- export.py 12 Feb 2007 11:35:34 -0000 1.20
***************
*** 41,46 ****
# Return the text of msg (a MAPIMsgStoreMsg object) as a string.
# There are subtleties, alas.
! def get_text(msg):
! email_object = msg.GetEmailPackageObject()
try:
# Don't use str(msg) instead -- that inserts an information-
--- 41,49 ----
# Return the text of msg (a MAPIMsgStoreMsg object) as a string.
# There are subtleties, alas.
! def get_text(msg, old_style):
! if old_style:
! email_object = msg.OldGetEmailPackageObject()
! else:
! email_object = msg.GetEmailPackageObject()
try:
# Don't use str(msg) instead -- that inserts an information-
***************
*** 93,97 ****
# Returns the total number of .txt files created (== the number of msgs
# successfully exported).
! def _export_folders(manager, root, buckets, folder_ids, include_sub):
from random import choice
--- 96,100 ----
# Returns the total number of .txt files created (== the number of msgs
# successfully exported).
! def _export_folders(manager, root, buckets, folder_ids, include_sub, old_style):
from random import choice
***************
*** 104,108 ****
# filename is the EID.txt
try:
! msg_text = get_text(message)
except KeyboardInterrupt:
raise
--- 107,111 ----
# filename is the EID.txt
try:
! msg_text = get_text(message, old_style)
except KeyboardInterrupt:
raise
***************
*** 121,125 ****
# This does all the work. 'directory' is the parent directory for the
# generated Ham and Spam sub-folders.
! def export(directory, num_buckets):
print "Loading bayes manager..."
manager = GetManager()
--- 124,128 ----
# This does all the work. 'directory' is the parent directory for the
# generated Ham and Spam sub-folders.
! def export(directory, num_buckets, old_style):
print "Loading bayes manager..."
manager = GetManager()
***************
*** 141,145 ****
buckets,
config.training.spam_folder_ids,
! config.training.spam_include_sub)
print "Exported", num, "spam messages."
--- 144,149 ----
buckets,
config.training.spam_folder_ids,
! config.training.spam_include_sub,
! old_style)
print "Exported", num, "spam messages."
***************
*** 149,153 ****
buckets,
config.training.ham_folder_ids,
! config.training.ham_include_sub)
print "Exported", num, "ham messages."
--- 153,158 ----
buckets,
config.training.ham_folder_ids,
! config.training.ham_include_sub,
! old_style)
print "Exported", num, "ham messages."
***************
*** 156,163 ****
try:
! opts, args = getopt.getopt(sys.argv[1:], "hqn:")
except getopt.error, d:
usage(d)
quiet = 0
num_buckets = NUM_BUCKETS
for opt, val in opts:
--- 161,169 ----
try:
! opts, args = getopt.getopt(sys.argv[1:], "hqon:")
except getopt.error, d:
usage(d)
quiet = 0
+ old_style = False
num_buckets = NUM_BUCKETS
for opt, val in opts:
***************
*** 168,171 ****
--- 174,179 ----
elif opt == '-n':
num_buckets = int(val)
+ elif opt == '-o':
+ old_style = True
else:
assert 0, "internal error on option '%s'" % opt
***************
*** 191,195 ****
if not quiet:
raw_input("Press enter to continue, or Ctrl+C to abort.")
! export(directory, num_buckets)
# Display errormsg (if specified), a blank line, and usage information; then
--- 199,203 ----
if not quiet:
raw_input("Press enter to continue, or Ctrl+C to abort.")
! export(directory, num_buckets, old_style=old_style)
# Display errormsg (if specified), a blank line, and usage information; then
Index: msgstore.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v
retrieving revision 1.100
retrieving revision 1.101
diff -C2 -d -r1.100 -r1.101
*** msgstore.py 6 Apr 2005 03:06:51 -0000 1.100
--- msgstore.py 12 Feb 2007 11:35:34 -0000 1.101
***************
*** 3,6 ****
--- 3,20 ----
import sys, os, re
import locale
+ from time import timezone
+
+ import email
+ from email.MIMEImage import MIMEImage
+ from email.Message import Message
+ from email.MIMEMultipart import MIMEMultipart
+ from email.MIMEText import MIMEText
+ from email.Parser import HeaderParser
+ from email.Utils import formatdate
+
+ try:
+ from cStringIO import StringIO
+ except ImportError:
+ from StringIO import StringIO
try:
***************
*** 723,727 ****
folder = self.msgstore._OpenEntry(self.id)
# Nuke my MAPI reference, and set my ID to None
! rc = folder.DeleteMessages(real_ids, 0, None, 0)
except pythoncom.com_error, details:
raise MsgStoreExceptionFromCOMException(details)
--- 737,741 ----
folder = self.msgstore._OpenEntry(self.id)
# Nuke my MAPI reference, and set my ID to None
! folder.DeleteMessages(real_ids, 0, None, 0)
except pythoncom.com_error, details:
raise MsgStoreExceptionFromCOMException(details)
***************
*** 884,888 ****
def _GetMessageText(self):
parts = self._GetMessageTextParts()
! # parts is (headers, body, html), but could possibly grow
return "\n".join(parts)
--- 898,904 ----
def _GetMessageText(self):
parts = self._GetMessageTextParts()
! # parts is (headers, body, html) - which needs more formalizing -
! # GetMessageText should become deprecated - it makes no sense in the
! # face of multi-part messages.
return "\n".join(parts)
***************
*** 893,896 ****
--- 909,915 ----
# Note we *dont* look in plain text attachments, which we arguably
# should.
+ # This should be refactored into a function that returns the headers,
+ # plus a list of email package sub-objects suitable for sending to
+ # the classifier.
from spambayes import mboxutils
***************
*** 936,939 ****
--- 955,960 ----
# Find all attachments with
# PR_ATTACH_MIME_TAG_A=multipart/signed
+ # XXX - see also self._GetAttachmentsToInclude(), which
+ # scans the attachment table - we should consolidate!
table = self.mapi_object.GetAttachmentTable(0)
restriction = (mapi.RES_PROPERTY, # a property restriction
***************
*** 973,977 ****
# it into a message object, so we can extract the text, so
# we can stick it back into another one. Ahhhhh.
- import email
msg = email.message_from_string(attach_body)
assert msg.is_multipart(), "Should be multi-part: %r" % attach_body
--- 994,997 ----
***************
*** 1032,1037 ****
def _format_time(self, raw):
- from time import timezone
- from email.Utils import formatdate
return formatdate(int(raw)-timezone, True)
--- 1052,1055 ----
***************
*** 1068,1071 ****
--- 1086,1134 ----
raise MsgStoreExceptionFromCOMException(details)
+ def _GetAttachmentsToInclude(self):
+ # Get the list of attachments to include in the email package
+ # Message object. Currently only images (BUT - consider consolidating
+ # with the attachment handling above for signed messages!)
+ from spambayes.Options import options
+ from spambayes.ImageStripper import image_large_size_attribute
+
+ # For now, we know these are the only 2 options that need attachments.
+ if not options['Tokenizer', 'x-crack_images'] and \
+ not options['Tokenizer', 'x-image_size']:
+ return []
+ try:
+ table = self.mapi_object.GetAttachmentTable(0)
+ tags = PR_ATTACH_NUM,PR_ATTACH_MIME_TAG_A,PR_ATTACH_SIZE,PR_ATTACH_DATA_BIN
+ attach_rows = mapi.HrQueryAllRows(table, tags, None, None, 0)
+ except pythoncom.com_error, why:
+ attach_rows = []
+
+ attachments = []
+ # Create a new attachment for each image.
+ for row in attach_rows:
+ attach_num = row[0][1]
+ # mime-tag may not exist - eg, seen on bounce messages
+ mime_tag = None
+ if PROP_TYPE(row[1][0]) != PT_ERROR:
+ mime_tag = row[1][1]
+ # oh - what is the library for this!?
+ if mime_tag:
+ typ, subtyp = mime_tag.split('/', 1)
+ if typ == 'image':
+ size = row[2][1]
+ # If it is too big, just write the size. ImageStripper.py
+ # checks this attribute.
+ if size > options["Tokenizer", "max_image_size"]:
+ sub = MIMEImage(None, subtyp)
+ setattr(sub, image_large_size_attribute, size)
+ else:
+ attach = self.mapi_object.OpenAttach(attach_num,
+ None, mapi.MAPI_DEFERRED_ERRORS)
+ data = GetPotentiallyLargeStringProp(attach,
+ PR_ATTACH_DATA_BIN, row[3])
+ sub = MIMEImage(data, subtyp)
+ attachments.append(sub)
+ return attachments
+
def GetEmailPackageObject(self, strip_mime_headers=True):
# Return an email.Message object.
***************
*** 1096,1099 ****
--- 1159,1288 ----
# Short course: we either have to synthesize non-insane MIME
# structure, or eliminate all evidence of original MIME structure.
+ # We used to do the latter - but now that we must give valid
+ # multipart messages which include attached images, we are forced
+ # to try and do the former (but actually the 2 options are not
+ # mutually exclusive - first we eliminate all evidence of original
+ # MIME structure, before allowing the email package to synthesize
+ # non-insane MIME structure).
+
+ # We still jump through hoops though - if we have no interesting
+ # attachments we attempt to return as close as possible as what
+ # we always returned in the past - a "single-part" message with the
+ # text and HTML as a simple text body.
+ header_text, body, html = self._GetMessageTextParts()
+
+ try: # catch all exceptions!
+ # Try and decide early if we want multipart or not.
+ # We originally just looked at the content-type - but Outlook
+ # is unreliable WRT that header! Also, consider a multipart message
+ # with only text and html sections and no additional attachments.
+ # Outlook will generally have copied the HTML and Text sections
+ # into the relevant properties and they will *not* appear as
+ # attachments. We should return the 'single' message here to keep
+ # as close to possible to what we used to return. We can change
+ # this policy in the future - but we would probably need to insist
+ # on a full re-train as the training tokens will have changed for
+ # many messages.
+ attachments = self._GetAttachmentsToInclude()
+ new_content_type = None
+ if attachments:
+ _class = MIMEMultipart
+ payload = []
+ if body:
+ payload.append(MIMEText(body))
+ if html:
+ payload.append(MIMEText(html, 'html'))
+ payload += attachments
+ new_content_type = "multipart/mixed"
+ else:
+ # Single message part with both text and HTML.
+ _class = Message
+ payload = body + '\n' + html
+
+ try:
+ root_msg = HeaderParser(_class=_class).parsestr(header_text)
+ except email.Errors.HeaderParseError:
+ raise # sob
+ # ack - it is about here we need to do what the old code did
+ # below: But - the fact the code below is dealing only
+ # with content-type (and the fact we handle that above) makes
+ # it less obvious....
+
+ ## But even this doesn't get *everything*. We can still see:
+ ## "multipart message with no defined boundary" or the
+ ## HeaderParseError above. Time to get brutal - hack out
+ ## the Content-Type header, so we see it as plain text.
+ #if msg is None:
+ # butcher_pos = text.lower().find("\ncontent-type: ")
+ # if butcher_pos < 0:
+ # # This error just just gunna get caught below anyway
+ # raise RuntimeError(
+ # "email package croaked with a MIME related error, but "
+ # "there appears to be no 'Content-Type' header")
+ # # Put it back together, skipping the original "\n" but
+ # # leaving the header leaving "\nSpamBayes-Content-Type: "
+ # butchered = text[:butcher_pos] + "\nSpamBayes-" + \
+ # text[butcher_pos+1:] + "\n\n"
+ # msg = email.message_from_string(butchered)
+
+ # patch up mime stuff - these headers will confuse the email
+ # package as it walks the attachments.
+ if strip_mime_headers:
+ for h, new_val in (('content-type', new_content_type),
+ ('content-transfer-encoding', None)):
+ try:
+ root_msg['X-SpamBayes-Original-' + h] = root_msg[h]
+ del root_msg[h]
+ except KeyError:
+ pass
+ if new_val is not None:
+ root_msg[h] = new_val
+
+ root_msg.set_payload(payload)
+
+ # We used to call email.message_from_string(text) and catch:
+ # email.Errors.BoundaryError: should no longer happen - we no longer
+ # ask the email package to parse anything beyond headers.
+ # email.Errors.HeaderParseError: caught above
+ except:
+ text = '\r\n'.join([header_text, body, html])
+ print "FAILED to create email.message from: ", `text`
+ raise
+
+ return root_msg
+
+ # XXX - this is the OLD version of GetEmailPackageObject() - it
+ # temporarily remains as a testing aid, to ensure that the different
+ # mime structure we now generate has no negative effects.
+ # Use 'sandbox/export.py -o' to export to the testdata directory
+ # in the old format, then run the cross-validation tests.
+ def OldGetEmailPackageObject(self, strip_mime_headers=True):
+ # Return an email.Message object.
+ #
+ # strip_mime_headers is a hack, and should be left True unless you're
+ # trying to display all the headers for diagnostic purposes. If we
+ # figure out something better to do, it should go away entirely.
+ #
+ # Problem #1: suppose a msg is multipart/alternative, with
+ # text/plain and text/html sections. The latter MIME decorations
+ # are plain missing in what _GetMessageText() returns. If we leave
+ # the multipart/alternative in the headers anyway, the email
+ # package's "lax parsing" won't complain about not finding any
+ # sections, but since the type *is* multipart/alternative then
+ # anyway, the tokenizer finds no text/* parts at all to tokenize.
+ # As a result, only the headers get tokenized. By stripping
+ # Content-Type from the headers (if present), the email pkg
+ # considers the body to be text/plain (the default), and so it
+ # does get tokenized.
+ #
+ # Problem #2: Outlook decodes quoted-printable and base64 on its
+ # own, but leaves any Content-Transfer-Encoding line in the headers.
+ # This can cause the email pkg to try to decode the text again,
+ # with unpleasant (but rarely fatal) results. If we strip that
+ # header too, no problem -- although the fact that a msg was
+ # encoded in base64 is usually a good spam clue, and we miss that.
+ #
+ # Short course: we either have to synthesize non-insane MIME
+ # structure, or eliminate all evidence of original MIME structure.
# Since we don't have a way to the former, by default this function
# does the latter.
***************
*** 1146,1150 ****
return msg
!
def SetField(self, prop, val):
# Future optimization note - from GetIDsFromNames doco
--- 1335,1340 ----
return msg
! # end of OLD GetEmailPackageObject
!
def SetField(self, prop, val):
# Future optimization note - from GetIDsFromNames doco
***************
*** 1341,1345 ****
def test():
- from win32com.client import Dispatch
outlook = Dispatch("Outlook.Application")
inbox = outlook.Session.GetDefaultFolder(constants.olFolderInbox)
--- 1531,1534 ----
More information about the Spambayes-checkins
mailing list