From mhammond at users.sourceforge.net Tue Feb 6 02:16:31 2007 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon, 05 Feb 2007 17:16:31 -0800 Subject: [Spambayes-checkins] spambayes/Outlook2000 manager.py, 1.109, 1.110 train.py, 1.41, 1.42 Message-ID: <20070206011633.D9B711E4011@bag.python.org> Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv7212 Modified Files: manager.py train.py Log Message: New persistent-stats code failed on a full retrain with bizarre errors relating to an invalid messageinfo_db. I'm not 100% confident this will correctly carry stats over after a retrain - but I'm not even sure what the desired semantics are, and this works better than it did! Index: manager.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/manager.py,v retrieving revision 1.109 retrieving revision 1.110 diff -C2 -d -r1.109 -r1.110 *** manager.py 22 Apr 2005 06:18:09 -0000 1.109 --- manager.py 6 Feb 2007 01:16:29 -0000 1.110 *************** *** 470,473 **** --- 470,477 ---- self.classifier_data.message_db) + def AdoptClassifierData(self, new_classifier_data): + self.classifier_data.Adopt(new_classifier_data) + self.stats.messageinfo_db = self.classifier_data.message_db + # Logging - this should be somewhere else. def LogDebug(self, level, *args): Index: train.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/train.py,v retrieving revision 1.41 retrieving revision 1.42 diff -C2 -d -r1.41 -r1.42 *** train.py 21 Dec 2004 21:48:38 -0000 1.41 --- train.py 6 Feb 2007 01:16:29 -0000 1.42 *************** *** 168,172 **** if rebuild: assert mgr.classifier_data is not classifier_data ! mgr.classifier_data.Adopt(classifier_data) classifier_data = mgr.classifier_data # If we are rebuilding, then we reset the statistics, too. 
--- 168,172 ---- if rebuild: assert mgr.classifier_data is not classifier_data ! mgr.AdoptClassifierData(classifier_data) classifier_data = mgr.classifier_data # If we are rebuilding, then we reset the statistics, too. From mhammond at users.sourceforge.net Mon Feb 12 12:25:09 2007 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon, 12 Feb 2007 03:25:09 -0800 Subject: [Spambayes-checkins] spambayes/spambayes ImageStripper.py, 1.11, 1.12 Options.py, 1.139, 1.140 tokenizer.py, 1.46, 1.47 Message-ID: <20070212112513.16E1E1E4011@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv3625 Modified Files: ImageStripper.py Options.py tokenizer.py Log Message: Add gocr support Index: ImageStripper.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v retrieving revision 1.11 retrieving revision 1.12 diff -C2 -d -r1.11 -r1.12 *** ImageStripper.py 2 Dec 2006 22:09:25 -0000 1.11 --- ImageStripper.py 12 Feb 2007 11:24:59 -0000 1.12 *************** *** 26,29 **** --- 26,41 ---- Image = None + # The email mime object carrying the image data can have a special attribute + # which indicates that a message had an image, but it was large (ie, larger + # than the 'max_image_size' option.) This allows the provider of the email + # object to avoid loading huge images into memory just to have this image + # stripper ignore it. + # If the attribute exists, it should be the size of the image (we assert it + # is > max_image_size). The image payload is ignored. + # A 'cleaner' option would be to look at a header - but an attribute was + # chosen to avoid spammers getting wise and 'injecting' the header into the + # message body of a mime section. 
+ image_large_size_attribute = "spambayes_image_large_size" + try: # We have three possibilities for Set: *************** *** 55,64 **** path = os.environ.get("PATH", "").split(os.pathsep) if sys.platform == "win32": - # Outlook plugin puts executables in (for example): - # C:/Program Files/SpamBayes/bin - # so add that directory to the path and make sure we - # look for a file ending in ".exe". - path.append(os.path.dirname(sys.executable)) prog = "%s.exe" % prog for directory in path: program = os.path.join(directory, prog) --- 67,84 ---- path = os.environ.get("PATH", "").split(os.pathsep) if sys.platform == "win32": prog = "%s.exe" % prog + if hasattr(sys, "frozen"): # a binary (py2exe) build.. + # Outlook plugin puts executables in (for example): + # C:/Program Files/SpamBayes/bin + # so add that directory to the path and make sure we + # look for a file ending in ".exe". + # Put it at the *start* of the paths we search - who knows + # what else me may encounter in the wild! + path.insert(0, os.path.dirname(sys.executable)) + else: + # a source build - for testing, allow it in SB package dir. + import spambayes + path.insert(0, os.path.abspath(spambayes.__path__[0])) + for directory in path: program = os.path.join(directory, prog) *************** *** 89,100 **** tokens = Set() rows = [] for part in parts: ! try: ! bytes = part.get_payload(decode=True) ! except: ! tokens.add("invalid-image:%s" % part.get_content_type()) ! continue ! if len(bytes) > options["Tokenizer", "max_image_size"]: tokens.add("image:big") continue # assume it's just a picture for now --- 109,130 ---- tokens = Set() rows = [] + max_image_size = options["Tokenizer", "max_image_size"] for part in parts: ! # See 'image_large_size_attribute' above - the provider may have seen ! # an image, but optimized the fact we don't bother processing large ! # images. ! nbytes = getattr(part, image_large_size_attribute, None) ! if nbytes is None: # no optimization - process normally... ! try: ! 
bytes = part.get_payload(decode=True) ! nbytes = len(bytes) ! except: ! tokens.add("invalid-image:%s" % part.get_content_type()) ! continue ! else: ! # optimization should not have remove images smaller than our max ! assert nbytes > max_image_size, (len(bytes), max_image_size) ! if nbytes > max_image_size: tokens.add("image:big") continue # assume it's just a picture for now *************** *** 157,161 **** full_image = imconcattb(full_image, image) ! fd, pnmfile = tempfile.mkstemp() os.close(fd) full_image.save(open(pnmfile, "wb"), "PPM") --- 187,191 ---- full_image = imconcattb(full_image, image) ! fd, pnmfile = tempfile.mkstemp('-spambayes-image') os.close(fd) full_image.save(open(pnmfile, "wb"), "PPM") *************** *** 163,166 **** --- 193,288 ---- return [pnmfile], tokens + class OCREngine(object): + """Base class for an OCR "engine" that extracts text. Ideally would + also deal with image format (as different engines will have different + requirements), but all currently supported ones deal with the PNM + formats (ppm/pgm/pbm) + """ + engine_name = None # sub-classes should override. + def __init__(self): + pass + + def is_enabled(self): + """Return true if this engine is able to be used. Note that + returning true only means it is *capable* of being used - not that + it is enabled. eg, it should check the program is needs to use + is installed, etc. + """ + raise NotImplementedError + + def extract_text(self, pnmfiles): + """Extract the text as an unprocessed stream (but as a string). + Typically this will be the raw output from the OCR engine. 
+ """ + raise NotImplementedError + + class OCRExecutableEngine(OCREngine): + """Uses a simple executable that writes to stdout to extract the text""" + program_name = None + def __init__(self): + # we go looking for the program first use and cache its location + self._program = None + OCREngine.__init__(self) + + def is_enabled(self): + return self.program is not None + + def get_program(self): + # by default, executable is same as engine name + if not self._program: + self._program = find_program(self.engine_name) + return self._program + + program = property(get_program) + + class OCREngineOCRAD(OCRExecutableEngine): + engine_name = "ocrad" + + def extract_text(self, pnmfile): + assert self.is_enabled(), "I'm not working!" + scale = options["Tokenizer", "ocrad_scale"] or 1 + charset = options["Tokenizer", "ocrad_charset"] + ocr = os.popen('%s -s %s -c %s -f "%s" 2>%s' % + (self.program, scale, charset, + pnmfile, os.path.devnull)) + ret = ocr.read() + ocr.close() + return ret + + class OCREngineGOCR(OCRExecutableEngine): + engine_name="gocr" + + def extract_text(self, pnmfile): + assert self.is_enabled(), "I'm not working!" + ocr = os.popen('%s "%s" 2>%s' % + (self.program, pnmfile, os.path.devnull)) + ret = ocr.read() + ocr.close() + return ret + + # This lists all engines, with the first listed that is enabled winning. + # Matched with the engine name, as specified in Options.py, via the + # 'engine_name' attribute on the class. + _ocr_engines = [ + OCREngineGOCR, + OCREngineOCRAD, + ] + + def get_engine(engine_name): + if not engine_name: + candidates = _ocr_engines + else: + for e in _ocr_engines: + if e.engine_name == engine_name: + candidates = [e] + break + else: + candidates = [] + for candidate in candidates: + engine = candidate() + if engine.is_enabled(): + return engine + return None + class ImageStripper: def __init__(self, cachefile=""): *************** *** 173,182 **** if self.cachefile: atexit.register(self.close) ! 
def extract_ocr_info(self, pnmfiles): textbits = [] tokens = Set() - scale = options["Tokenizer", "ocrad_scale"] or 1 - charset = options["Tokenizer", "ocrad_charset"] for pnmfile in pnmfiles: fhash = md5.new(open(pnmfile).read()).hexdigest() --- 295,304 ---- if self.cachefile: atexit.register(self.close) ! self.engine = None ! def extract_ocr_info(self, pnmfiles): + assert self.engine, "must have an engine!" textbits = [] tokens = Set() for pnmfile in pnmfiles: fhash = md5.new(open(pnmfile).read()).hexdigest() *************** *** 186,194 **** else: self.misses += 1 ! ocr = os.popen('%s -s %s -c %s -f "%s" 2>%s' % ! (find_program("ocrad"), scale, charset, ! pnmfile, os.path.devnull)) ! ctext = ocr.read().lower() ! ocr.close() ctokens = set() if not ctext.strip(): --- 308,322 ---- else: self.misses += 1 ! if self.engine.program: ! ctext = self.engine.extract_text(pnmfile).lower() ! else: ! # We should not get here if no OCR is enabled. If it ! # is enabled and we have no program, its OK to spew lots ! # of warnings - they should either disable OCR (it is by ! # default), or fix their config. ! print >> sys.stderr, \ ! "No OCR program '%s' available - can't get text!" \ ! % (self.engine.program_name,) ! ctext = "" ctokens = set() if not ctext.strip(): *************** *** 208,217 **** return "\n".join(textbits), tokens ! def analyze(self, parts): ! if not parts: return "", Set() ! # need ocrad ! if not find_program("ocrad"): return "", Set() --- 336,353 ---- return "\n".join(textbits), tokens ! def analyze(self, engine_name, parts): ! # check engine hasn't changed... ! if self.engine is not None and self.engine.engine_name!=engine_name: ! self.engine = None ! # check engine exists and is valid ! if self.engine is None: ! self.engine = get_engine(engine_name) ! if self.engine is None: ! # We only get here if explicitly enabled - spewing msgs is ok. ! print >> sys.stderr, "invalid engine name '%s' - OCR disabled" \ ! % (engine_name,) return "", Set() ! 
if not parts: return "", Set() Index: Options.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v retrieving revision 1.139 retrieving revision 1.140 diff -C2 -d -r1.139 -r1.140 *** Options.py 9 Sep 2006 23:02:20 -0000 1.139 --- Options.py 12 Feb 2007 11:24:59 -0000 1.140 *************** *** 129,137 **** (hopefully) text content contained in any images in each message. The current support is minimal, relies on the installation of ! ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and PIL. ! It is almost certainly only useful in its current form on Unix-like ! machines."""), BOOLEAN, RESTORE), ("crack_image_cache", _("Cache to speed up ocr."), "", _("""If non-empty, names a file from which to read cached ocr info --- 129,145 ---- (hopefully) text content contained in any images in each message. The current support is minimal, relies on the installation of ! an OCR 'engine' (see x-ocr_engine.)"""), BOOLEAN, RESTORE), + ("x-ocr_engine", _("OCR engine to use"), "", + _("""(EXPERIMENTAL) The name of the OCR engine to use. If empty, all + supported engines will be checked to see if they are installed. 
+ Engines currently supported include ocrad + (http://www.gnu.org/software/ocrad/ocrad.html) and gocr + (http://jocr.sourceforge.net/download.html) and they require the + appropriate executable be installed in either your PATH, or in the + main spambayes directory."""), + HEADER_VALUE, RESTORE), + ("crack_image_cache", _("Cache to speed up ocr."), "", _("""If non-empty, names a file from which to read cached ocr info Index: tokenizer.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v retrieving revision 1.46 retrieving revision 1.47 diff -C2 -d -r1.46 -r1.47 *** tokenizer.py 13 Dec 2006 14:44:49 -0000 1.46 --- tokenizer.py 12 Feb 2007 11:25:00 -0000 1.47 *************** *** 1619,1623 **** # Find image/* parts of the body, calculating the log(size) of # each image. ! total_len = 0 for part in parts: --- 1619,1623 ---- # Find image/* parts of the body, calculating the log(size) of # each image. ! total_len = 0 for part in parts: *************** *** 1636,1641 **** if options["Tokenizer", "x-crack_images"]: from spambayes.ImageStripper import crack_images ! text, tokens = crack_images(parts) for t in tokens: yield t --- 1636,1642 ---- if options["Tokenizer", "x-crack_images"]: + engine_name = options["Tokenizer", 'x-ocr_engine'] from spambayes.ImageStripper import crack_images ! 
text, tokens = crack_images(engine_name, parts) for t in tokens: yield t From mhammond at users.sourceforge.net Mon Feb 12 12:35:38 2007 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon, 12 Feb 2007 03:35:38 -0800 Subject: [Spambayes-checkins] spambayes/Outlook2000/sandbox dump_email.py, NONE, 1.1 score.py, NONE, 1.1 Message-ID: <20070212113540.D44161E4004@bag.python.org> Update of /cvsroot/spambayes/spambayes/Outlook2000/sandbox In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8117/sandbox Added Files: dump_email.py score.py Log Message: Integrate OCR with outlook plugin --- NEW FILE: dump_email.py --- """dump one or more items as an 'email object' to stdout.""" import sys, os import optparse from win32com.mapi import mapi, mapiutil from win32com.mapi.mapitags import * import win32clipboard try: from manager import BayesManager except ImportError: if hasattr(sys, "frozen"): raise sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from manager import BayesManager import mapi_driver from cStringIO import StringIO def Dump(driver, manager, mapi_folder, subject, stream=None): for item in driver.GetItemsWithValue(mapi_folder, PR_SUBJECT_A, subject): hr, props = item.GetProps((PR_ENTRYID,PR_STORE_ENTRYID), 0) (tag, eid), (tag, store_eid) = props eid = mapi.HexFromBin(eid) store_eid = mapi.HexFromBin(store_eid) print >> stream, "Dumping message with ID %s/%s" % (store_eid, eid) msm = manager.message_store.GetMessage((store_eid, eid)) ob = msm.GetEmailPackageObject() print >> stream, ob.as_string() print >> stream def main(): driver = mapi_driver.MAPIDriver() parser = optparse.OptionParser("%prog [options] [path ...]", description=__doc__) parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="don't print status messages to stdout") parser.add_option("-f", "--folder", action="store", default="Inbox", help="folder to search") parser.add_option("-c", "--clipboard", action="store", 
help="write results to the clipboard") options, args = parser.parse_args() subject = " ".join(args) try: folder = driver.FindFolder(options.folder) except ValueError, details: parser.error(details) stream = None if options.clipboard: stream = StringIO() Dump(driver, BayesManager(), folder, subject, stream) if options.clipboard: win32clipboard.OpenClipboard() win32clipboard.EmptyClipboard() win32clipboard.SetClipboardText(stream.getvalue()) print "Output successfuly written to the Windows clipboard" if __name__=='__main__': main() --- NEW FILE: score.py --- """Scores one or more items in your Outlook store.""" # score one or more items, write results to stdout. # Helps test new features (eg, OCR) outside the Outlook environment. import sys, os import optparse from win32com.mapi import mapi, mapiutil from win32com.mapi.mapitags import * import win32clipboard try: from manager import BayesManager except ImportError: if hasattr(sys, "frozen"): raise sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) from manager import BayesManager from addin import GetClues import mapi_driver from cStringIO import StringIO def Score(driver, manager, mapi_folder, subject, options, stream=None): num = 0 if options.all: getter = driver.GetAllItems getter_args = (mapi_folder,) else: getter = driver.GetItemsWithValue getter_args = (mapi_folder, PR_SUBJECT_A, subject) for item in getter(*getter_args): num += 1 if num % 1000 == 0: print >> sys.stderr, "Processed", num, "items..." 
hr, props = item.GetProps((PR_ENTRYID,PR_STORE_ENTRYID, PR_SUBJECT_A), 0) (tag, eid), (tag, store_eid), (tag, sub) = props eid = mapi.HexFromBin(eid) store_eid = mapi.HexFromBin(store_eid) try: msm = manager.message_store.GetMessage((store_eid, eid)) manager.classifier_data.message_db.load_msg(msm) score = manager.score(msm) if not options.quiet: print "Message %r scored %g" % (sub, score) if options.show_clues: clues = GetClues(manager, msm) if not options.quiet: print >> stream, clues if options.quiet: continue if options.show_image_info: eob = msm.GetEmailPackageObject() # Show what the OCR managed to extract. from spambayes.ImageStripper import crack_images from spambayes.tokenizer import imageparts image_text, image_toks = crack_images(imageparts(eob)) print >> stream, "Image text:", repr(image_text) print >> stream, "Image tokens:", repr(image_toks) print >> stream # blank lines between messages except: print >> sys.stderr, "FAILED to convert message:", sub raise print >> stream, "Scored", num, "messages." 
def main(): driver = mapi_driver.MAPIDriver() parser = optparse.OptionParser("%prog [options] subject of message ...", description=__doc__) parser.add_option("-q", "--quiet", action="store_true", dest="quiet", default=False, help="don't print score info - useful for testing") parser.add_option("-f", "--folder", action="store", default="Inbox", help="folder to search") parser.add_option("", "--clipboard", action="store_true", help="write results to the clipboard") parser.add_option("-c", "--show-clues", action="store_true", help="also write the clues for the message") parser.add_option("-a", "--all", action="store_true", help="ignore the subject and score all items in the folder") parser.add_option("-i", "--show-image-info", action="store_true", help="show the information we can extract from images " "in the mail") options, args = parser.parse_args() subject = " ".join(args) try: folder = driver.FindFolder(options.folder) except ValueError, details: parser.error(details) stream = None if options.clipboard: stream = StringIO() Score(driver, BayesManager(), folder, subject, options, stream) if options.clipboard: win32clipboard.OpenClipboard() win32clipboard.EmptyClipboard() win32clipboard.SetClipboardText(stream.getvalue()) print "Output successfuly written to the Windows clipboard" if __name__=='__main__': main() From mhammond at users.sourceforge.net Mon Feb 12 12:35:38 2007 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Mon, 12 Feb 2007 03:35:38 -0800 Subject: [Spambayes-checkins] spambayes/Outlook2000 addin.py, 1.153, 1.154 export.py, 1.19, 1.20 msgstore.py, 1.100, 1.101 Message-ID: <20070212113541.519E51E4004@bag.python.org> Update of /cvsroot/spambayes/spambayes/Outlook2000 In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8117 Modified Files: addin.py export.py msgstore.py Log Message: Integrate OCR with outlook plugin Index: addin.py =================================================================== RCS file: 
/cvsroot/spambayes/spambayes/Outlook2000/addin.py,v retrieving revision 1.153 retrieving revision 1.154 diff -C2 -d -r1.153 -r1.154 *** addin.py 10 Jun 2006 04:57:10 -0000 1.153 --- addin.py 12 Feb 2007 11:35:34 -0000 1.154 *************** *** 444,460 **** TrainAsSpam(msgstore_message, self.manager) ! # Event function fired from the "Show Clues" UI items. ! def ShowClues(mgr, explorer): from cgi import escape - - app = explorer.Application - msgstore_message = explorer.GetSelectedMessages(False) - if msgstore_message is None: - return mgr.classifier_data.message_db.load_msg(msgstore_message) - - item = msgstore_message.GetOutlookItem() score, clues = mgr.score(msgstore_message, evidence=True) ! new_msg = app.CreateItem(0) # NOTE: Silly Outlook always switches the message editor back to RTF # once the Body property has been set. Thus, there is no reasonable --- 444,452 ---- TrainAsSpam(msgstore_message, self.manager) ! def GetClues(mgr, msgstore_message): from cgi import escape mgr.classifier_data.message_db.load_msg(msgstore_message) score, clues = mgr.score(msgstore_message, evidence=True) ! # NOTE: Silly Outlook always switches the message editor back to RTF # once the Body property has been set. Thus, there is no reasonable *************** *** 533,538 **** # Now the raw text of the message, as best we can push("

<h2>Message Stream</h2>\n") - push("<PRE>\n")
      msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False)
      push(escape(msg.as_string(), True))
      push("</PRE>\n") --- 525,530 ---- # Now the raw text of the message, as best we can push("<h2>Message Stream</h2>\n") msg = msgstore_message.GetEmailPackageObject(strip_mime_headers=False) + push("<PRE>\n")
      push(escape(msg.as_string(), True))
      push("</PRE>
\n") *************** *** 562,565 **** --- 554,570 ---- # Put the body together, then the rest of the message. body = ''.join(body) + return body + + # Event function fired from the "Show Clues" UI items. + def ShowClues(mgr, explorer): + + app = explorer.Application + msgstore_message = explorer.GetSelectedMessages(False) + if msgstore_message is None: + return + + body = GetClues(mgr, msgstore_message) + item = msgstore_message.GetOutlookItem() + new_msg = app.CreateItem(0) new_msg.Subject = "Spam Clues: " + item.Subject # As above, use HTMLBody else Outlook refuses to behave. Index: export.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/export.py,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** export.py 29 Dec 2003 00:35:20 -0000 1.19 --- export.py 12 Feb 2007 11:35:34 -0000 1.20 *************** *** 41,46 **** # Return the text of msg (a MAPIMsgStoreMsg object) as a string. # There are subtleties, alas. ! def get_text(msg): ! email_object = msg.GetEmailPackageObject() try: # Don't use str(msg) instead -- that inserts an information- --- 41,49 ---- # Return the text of msg (a MAPIMsgStoreMsg object) as a string. # There are subtleties, alas. ! def get_text(msg, old_style): ! if old_style: ! email_object = msg.OldGetEmailPackageObject() ! else: ! email_object = msg.GetEmailPackageObject() try: # Don't use str(msg) instead -- that inserts an information- *************** *** 93,97 **** # Returns the total number of .txt files created (== the number of msgs # successfully exported). ! def _export_folders(manager, root, buckets, folder_ids, include_sub): from random import choice --- 96,100 ---- # Returns the total number of .txt files created (== the number of msgs # successfully exported). ! def _export_folders(manager, root, buckets, folder_ids, include_sub, old_style): from random import choice *************** *** 104,108 **** # filename is the EID.txt try: ! 
msg_text = get_text(message) except KeyboardInterrupt: raise --- 107,111 ---- # filename is the EID.txt try: ! msg_text = get_text(message, old_style) except KeyboardInterrupt: raise *************** *** 121,125 **** # This does all the work. 'directory' is the parent directory for the # generated Ham and Spam sub-folders. ! def export(directory, num_buckets): print "Loading bayes manager..." manager = GetManager() --- 124,128 ---- # This does all the work. 'directory' is the parent directory for the # generated Ham and Spam sub-folders. ! def export(directory, num_buckets, old_style): print "Loading bayes manager..." manager = GetManager() *************** *** 141,145 **** buckets, config.training.spam_folder_ids, ! config.training.spam_include_sub) print "Exported", num, "spam messages." --- 144,149 ---- buckets, config.training.spam_folder_ids, ! config.training.spam_include_sub, ! old_style) print "Exported", num, "spam messages." *************** *** 149,153 **** buckets, config.training.ham_folder_ids, ! config.training.ham_include_sub) print "Exported", num, "ham messages." --- 153,158 ---- buckets, config.training.ham_folder_ids, ! config.training.ham_include_sub, ! old_style) print "Exported", num, "ham messages." *************** *** 156,163 **** try: ! opts, args = getopt.getopt(sys.argv[1:], "hqn:") except getopt.error, d: usage(d) quiet = 0 num_buckets = NUM_BUCKETS for opt, val in opts: --- 161,169 ---- try: ! opts, args = getopt.getopt(sys.argv[1:], "hqon:") except getopt.error, d: usage(d) quiet = 0 + old_style = False num_buckets = NUM_BUCKETS for opt, val in opts: *************** *** 168,171 **** --- 174,179 ---- elif opt == '-n': num_buckets = int(val) + elif opt == '-o': + old_style = True else: assert 0, "internal error on option '%s'" % opt *************** *** 191,195 **** if not quiet: raw_input("Press enter to continue, or Ctrl+C to abort.") ! 
export(directory, num_buckets) # Display errormsg (if specified), a blank line, and usage information; then --- 199,203 ---- if not quiet: raw_input("Press enter to continue, or Ctrl+C to abort.") ! export(directory, num_buckets, old_style=old_style) # Display errormsg (if specified), a blank line, and usage information; then Index: msgstore.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/Outlook2000/msgstore.py,v retrieving revision 1.100 retrieving revision 1.101 diff -C2 -d -r1.100 -r1.101 *** msgstore.py 6 Apr 2005 03:06:51 -0000 1.100 --- msgstore.py 12 Feb 2007 11:35:34 -0000 1.101 *************** *** 3,6 **** --- 3,20 ---- import sys, os, re import locale + from time import timezone + + import email + from email.MIMEImage import MIMEImage + from email.Message import Message + from email.MIMEMultipart import MIMEMultipart + from email.MIMEText import MIMEText + from email.Parser import HeaderParser + from email.Utils import formatdate + + try: + from cStringIO import StringIO + except ImportError: + from StringIO import StringIO try: *************** *** 723,727 **** folder = self.msgstore._OpenEntry(self.id) # Nuke my MAPI reference, and set my ID to None ! rc = folder.DeleteMessages(real_ids, 0, None, 0) except pythoncom.com_error, details: raise MsgStoreExceptionFromCOMException(details) --- 737,741 ---- folder = self.msgstore._OpenEntry(self.id) # Nuke my MAPI reference, and set my ID to None ! folder.DeleteMessages(real_ids, 0, None, 0) except pythoncom.com_error, details: raise MsgStoreExceptionFromCOMException(details) *************** *** 884,888 **** def _GetMessageText(self): parts = self._GetMessageTextParts() ! # parts is (headers, body, html), but could possibly grow return "\n".join(parts) --- 898,904 ---- def _GetMessageText(self): parts = self._GetMessageTextParts() ! # parts is (headers, body, html) - which needs more formalizing - ! 
# GetMessageText should become deprecated - it makes no sense in the ! # face of multi-part messages. return "\n".join(parts) *************** *** 893,896 **** --- 909,915 ---- # Note we *dont* look in plain text attachments, which we arguably # should. + # This should be refactored into a function that returns the headers, + # plus a list of email package sub-objects suitable for sending to + # the classifier. from spambayes import mboxutils *************** *** 936,939 **** --- 955,960 ---- # Find all attachments with # PR_ATTACH_MIME_TAG_A=multipart/signed + # XXX - see also self._GetAttachmentsToInclude(), which + # scans the attachment table - we should consolidate! table = self.mapi_object.GetAttachmentTable(0) restriction = (mapi.RES_PROPERTY, # a property restriction *************** *** 973,977 **** # it into a message object, so we can extract the text, so # we can stick it back into another one. Ahhhhh. - import email msg = email.message_from_string(attach_body) assert msg.is_multipart(), "Should be multi-part: %r" % attach_body --- 994,997 ---- *************** *** 1032,1037 **** def _format_time(self, raw): - from time import timezone - from email.Utils import formatdate return formatdate(int(raw)-timezone, True) --- 1052,1055 ---- *************** *** 1068,1071 **** --- 1086,1134 ---- raise MsgStoreExceptionFromCOMException(details) + def _GetAttachmentsToInclude(self): + # Get the list of attachments to include in the email package + # Message object. Currently only images (BUT - consider consolidating + # with the attachment handling above for signed messages!) + from spambayes.Options import options + from spambayes.ImageStripper import image_large_size_attribute + + # For now, we know these are the only 2 options that need attachments. 
+ if not options['Tokenizer', 'x-crack_images'] and \ + not options['Tokenizer', 'x-image_size']: + return [] + try: + table = self.mapi_object.GetAttachmentTable(0) + tags = PR_ATTACH_NUM,PR_ATTACH_MIME_TAG_A,PR_ATTACH_SIZE,PR_ATTACH_DATA_BIN + attach_rows = mapi.HrQueryAllRows(table, tags, None, None, 0) + except pythoncom.com_error, why: + attach_rows = [] + + attachments = [] + # Create a new attachment for each image. + for row in attach_rows: + attach_num = row[0][1] + # mime-tag may not exist - eg, seen on bounce messages + mime_tag = None + if PROP_TYPE(row[1][0]) != PT_ERROR: + mime_tag = row[1][1] + # oh - what is the library for this!? + if mime_tag: + typ, subtyp = mime_tag.split('/', 1) + if typ == 'image': + size = row[2][1] + # If it is too big, just write the size. ImageStripper.py + # checks this attribute. + if size > options["Tokenizer", "max_image_size"]: + sub = MIMEImage(None, subtyp) + setattr(sub, image_large_size_attribute, size) + else: + attach = self.mapi_object.OpenAttach(attach_num, + None, mapi.MAPI_DEFERRED_ERRORS) + data = GetPotentiallyLargeStringProp(attach, + PR_ATTACH_DATA_BIN, row[3]) + sub = MIMEImage(data, subtyp) + attachments.append(sub) + return attachments + def GetEmailPackageObject(self, strip_mime_headers=True): # Return an email.Message object. *************** *** 1096,1099 **** --- 1159,1288 ---- # Short course: we either have to synthesize non-insane MIME # structure, or eliminate all evidence of original MIME structure. + # We used to do the latter - but now that we must give valid + # multipart messages which include attached images, we are forced + # to try and do the former (but actually the 2 options are not + # mutually exclusive - first we eliminate all evidence of original + # MIME structure, before allowing the email package to synthesize + # non-insane MIME structure. 
+ + # We still jump through hoops though - if we have no interesting + # attachments we attempt to return as close as possible as what + # we always returned in the past - a "single-part" message with the + # text and HTML as a simple text body. + header_text, body, html = self._GetMessageTextParts() + + try: # catch all exceptions! + # Try and decide early if we want multipart or not. + # We originally just looked at the content-type - but Outlook + # is unreliable WRT that header! Also, consider a message multipart message + # with only text and html sections and no additional attachments. + # Outlook will generally have copied the HTML and Text sections + # into the relevant properties and they will *not* appear as + # attachments. We should return the 'single' message here to keep + # as close to possible to what we used to return. We can change + # this policy in the future - but we would probably need to insist + # on a full re-train as the training tokens will have changed for + # many messages. + attachments = self._GetAttachmentsToInclude() + new_content_type = None + if attachments: + _class = MIMEMultipart + payload = [] + if body: + payload.append(MIMEText(body)) + if html: + payload.append(MIMEText(html, 'html')) + payload += attachments + new_content_type = "multipart/mixed" + else: + # Single message part with both text and HTML. + _class = Message + payload = body + '\n' + html + + try: + root_msg = HeaderParser(_class=_class).parsestr(header_text) + except email.Errors.HeaderParseError: + raise # sob + # ack - it is about here we need to do what the old code did + # below: But - the fact the code below is dealing only + # with content-type (and the fact we handle that above) makes + # it less obvious.... + + ## But even this doesn't get *everything*. We can still see: + ## "multipart message with no defined boundary" or the + ## HeaderParseError above. Time to get brutal - hack out + ## the Content-Type header, so we see it as plain text. 
+ #if msg is None: + # butcher_pos = text.lower().find("\ncontent-type: ") + # if butcher_pos < 0: + # # This error is just going to get caught below anyway + # raise RuntimeError( + # "email package croaked with a MIME related error, but " + # "there appears to be no 'Content-Type' header") + # # Put it back together, skipping the original "\n" but + # # leaving the header as "\nSpamBayes-Content-Type: " + # butchered = text[:butcher_pos] + "\nSpamBayes-" + \ + # text[butcher_pos+1:] + "\n\n" + # msg = email.message_from_string(butchered) + + # patch up mime stuff - these headers will confuse the email + # package as it walks the attachments. + if strip_mime_headers: + for h, new_val in (('content-type', new_content_type), + ('content-transfer-encoding', None)): + try: + root_msg['X-SpamBayes-Original-' + h] = root_msg[h] + del root_msg[h] + except KeyError: + pass + if new_val is not None: + root_msg[h] = new_val + + root_msg.set_payload(payload) + + # We used to call email.message_from_string(text) and catch: + # email.Errors.BoundaryError: should no longer happen - we no longer + # ask the email package to parse anything beyond headers. + # email.Errors.HeaderParseError: caught above + except: + text = '\r\n'.join([header_text, body, html]) + print "FAILED to create email.message from: ", `text` + raise + + return root_msg + + # XXX - this is the OLD version of GetEmailPackageObject() - it + # temporarily remains as a testing aid, to ensure that the different + # mime structure we now generate has no negative effects. + # Use 'sandbox/export.py -o' to export to the testdata directory + # in the old format, then run the cross-validation tests. + def OldGetEmailPackageObject(self, strip_mime_headers=True): + # Return an email.Message object. + # + # strip_mime_headers is a hack, and should be left True unless you're + # trying to display all the headers for diagnostic purposes. If we + # figure out something better to do, it should go away entirely. 
+ # + # Problem #1: suppose a msg is multipart/alternative, with + # text/plain and text/html sections. The latter MIME decorations + # are plain missing in what _GetMessageText() returns. If we leave + # the multipart/alternative in the headers anyway, the email + # package's "lax parsing" won't complain about not finding any + # sections, but since the type *is* multipart/alternative then + # anyway, the tokenizer finds no text/* parts at all to tokenize. + # As a result, only the headers get tokenized. By stripping + # Content-Type from the headers (if present), the email pkg + # considers the body to be text/plain (the default), and so it + # does get tokenized. + # + # Problem #2: Outlook decodes quoted-printable and base64 on its + # own, but leaves any Content-Transfer-Encoding line in the headers. + # This can cause the email pkg to try to decode the text again, + # with unpleasant (but rarely fatal) results. If we strip that + # header too, no problem -- although the fact that a msg was + # encoded in base64 is usually a good spam clue, and we miss that. + # + # Short course: we either have to synthesize non-insane MIME + # structure, or eliminate all evidence of original MIME structure. # Since we don't have a way to the former, by default this function # does the latter. *************** *** 1146,1150 **** return msg ! def SetField(self, prop, val): # Future optimization note - from GetIDsFromNames doco --- 1335,1340 ---- return msg ! # end of OLD GetEmailPackageObject ! 
def SetField(self, prop, val): # Future optimization note - from GetIDsFromNames doco *************** *** 1341,1345 **** def test(): - from win32com.client import Dispatch outlook = Dispatch("Outlook.Application") inbox = outlook.Session.GetDefaultFolder(constants.olFolderInbox) --- 1531,1534 ---- From mhammond at users.sourceforge.net Wed Feb 14 01:53:25 2007 From: mhammond at users.sourceforge.net (Mark Hammond) Date: Tue, 13 Feb 2007 16:53:25 -0800 Subject: [Spambayes-checkins] spambayes/spambayes ImageStripper.py, 1.12, 1.13 Message-ID: <20070214005327.0AB0E1E4011@bag.python.org> Update of /cvsroot/spambayes/spambayes/spambayes In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv4609 Modified Files: ImageStripper.py Log Message: Fix typos - s/program_name/engine_name/ Index: ImageStripper.py =================================================================== RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v retrieving revision 1.12 retrieving revision 1.13 diff -C2 -d -r1.12 -r1.13 *** ImageStripper.py 12 Feb 2007 11:24:59 -0000 1.12 --- ImageStripper.py 14 Feb 2007 00:53:22 -0000 1.13 *************** *** 219,223 **** class OCRExecutableEngine(OCREngine): """Uses a simple executable that writes to stdout to extract the text""" ! program_name = None def __init__(self): # we go looking for the program first use and cache its location --- 219,223 ---- class OCRExecutableEngine(OCREngine): """Uses a simple executable that writes to stdout to extract the text""" ! engine_name = None def __init__(self): # we go looking for the program first use and cache its location *************** *** 317,321 **** print >> sys.stderr, \ "No OCR program '%s' available - can't get text!" \ ! % (self.engine.program_name,) ctext = "" ctokens = set() --- 317,321 ---- print >> sys.stderr, \ "No OCR program '%s' available - can't get text!" \ ! % (self.engine.engine_name,) ctext = "" ctokens = set()