[Spambayes-checkins] spambayes/Outlook2000 export.py,1.1,1.2

Mark Hammond mhammond@users.sourceforge.net
Thu Nov 21 12:06:58 2002


Update of /cvsroot/spambayes/spambayes/Outlook2000
In directory sc8-pr-cvs1:/tmp/cvs-serv913

Modified Files:
	export.py 
Log Message:
Select the correct number of sets even when there is more spam than ham,
and allow the user to specify how many messages to aim for in each directory.


Index: export.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Outlook2000/export.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** export.py	21 Nov 2002 11:20:14 -0000	1.1
--- export.py	21 Nov 2002 12:06:55 -0000	1.2
***************
*** 4,25 ****
  from manager import GetManager
  
  
! def BuildBuckets(manager, root_directory, folder_ids, include_sub):
      store = manager.message_store
      config = manager.config
!     num = 0
      for folder in store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
          for msg in folder.GetMessageGenerator():
!             num += 1
!     num_buckets = num / 400
      dirs = []
      for i in range(num_buckets):
!         dir=os.path.join(root_directory, "Set%d" % (i+1,))
!         dir=os.path.abspath(dir)
!         if os.path.isdir(dir):
!             shutil.rmtree(dir)
!         os.makedirs(dir)
!         dirs.append(dir)
!     return dirs
  
  def ChooseBucket(buckets):
--- 4,24 ----
  from manager import GetManager
  
+ files_per_directory = 400
  
! def BuildBuckets(manager):
      store = manager.message_store
      config = manager.config
!     num_ham = num_spam = 0
      for folder in store.GetFolderGenerator(config.training.spam_folder_ids, config.training.spam_include_sub):
          for msg in folder.GetMessageGenerator():
!             num_spam += 1
!     for folder in store.GetFolderGenerator(config.training.ham_folder_ids, config.training.ham_include_sub):
!         for msg in folder.GetMessageGenerator():
!             num_ham += 1
!     num_buckets = min(num_ham, num_spam)/ files_per_directory
      dirs = []
      for i in range(num_buckets):
!         dirs.append("Set%d" % (i+1,))
!     return num_spam, num_ham, dirs
  
  def ChooseBucket(buckets):
***************
*** 27,38 ****
      return random.choice(buckets)
  
! def _export_folders(manager, dir, folder_ids, include_sub):
      num = 0
      store = manager.message_store
-     buckets = BuildBuckets(manager, dir, folder_ids, include_sub)
      for folder in store.GetFolderGenerator(folder_ids, include_sub):
          print "", folder.name
          for message in folder.GetMessageGenerator():
!             dir = ChooseBucket(buckets)
              # filename is the EID.txt
              try:
--- 26,37 ----
      return random.choice(buckets)
  
! def _export_folders(manager, dir, buckets, folder_ids, include_sub):
      num = 0
      store = manager.message_store
      for folder in store.GetFolderGenerator(folder_ids, include_sub):
          print "", folder.name
          for message in folder.GetMessageGenerator():
!             sub = ChooseBucket(buckets)
!             this_dir = os.path.join(dir, sub)
              # filename is the EID.txt
              try:
***************
*** 45,49 ****
                  continue
  
!             fname = os.path.join(dir, message.GetID()[1]) + ".txt"
              f = open(fname, "w")
              f.write(msg_text)
--- 44,48 ----
                  continue
  
!             fname = os.path.join(this_dir, message.GetID()[1]) + ".txt"
              f = open(fname, "w")
              f.write(msg_text)
***************
*** 57,74 ****
      config = manager.config
  
      print "Exporting spam..."
!     num = _export_folders(manager, os.path.join(directory, "Spam"),
                            config.training.spam_folder_ids, config.training.spam_include_sub)
!     print "Exported", num, " spam messages."
  
!     print "Exporting ham...",
!     num = _export_folders(manager, os.path.join(directory, "Ham"),
                            config.training.ham_folder_ids, config.training.ham_include_sub)
!     print "Exported", num, " ham messages."
  
  def main():
      import getopt
      try:
!         opts, args = getopt.getopt(sys.argv[1:], "q")
      except getopt.error, d:
          print d
--- 56,84 ----
      config = manager.config
  
+     num_spam, num_ham, buckets = BuildBuckets(manager)
+     print "Have %d spam, and %d ham to export, spread over %d directories." \
+           % (num_spam, num_ham, len(buckets))
+ 
+     for sub in ["Spam", "Ham"]:
+         if os.path.exists(os.path.join(directory, sub)):
+             shutil.rmtree(os.path.join(directory, sub))
+         for b in buckets:
+             d = os.path.join(directory, sub, b)
+             os.makedirs(d)
+ 
      print "Exporting spam..."
!     num = _export_folders(manager, os.path.join(directory, "Spam"), buckets,
                            config.training.spam_folder_ids, config.training.spam_include_sub)
!     print "Exported", num, "spam messages."
  
!     print "Exporting ham..."
!     num = _export_folders(manager, os.path.join(directory, "Ham"), buckets,
                            config.training.ham_folder_ids, config.training.ham_include_sub)
!     print "Exported", num, "ham messages."
  
  def main():
      import getopt
      try:
!         opts, args = getopt.getopt(sys.argv[1:], "qn:")
      except getopt.error, d:
          print d
***************
*** 79,82 ****
--- 89,95 ----
          if opt=='-q':
              quiet = 1
+         elif opt=='-n':
+             global files_per_directory
+             files_per_directory = int(val)
  
      if len(args) > 1:
***************
*** 106,109 ****
--- 119,123 ----
  
  -q : quiet - don't prompt for confirmation.
+ -n : Minimum number of files to aim for in each directory, default=%d
  
  Export the folders defined in the Outlook Plugin to a test directory.
***************
*** 115,119 ****
  If 'directory' exists, it will be recursively deleted before
  the export (but you will be asked to confirm unless -q is given).""" \
!             % (os.path.basename(sys.argv[0]))
      sys.exit(1)
  
--- 129,133 ----
  If 'directory' exists, it will be recursively deleted before
  the export (but you will be asked to confirm unless -q is given).""" \
!             % (os.path.basename(sys.argv[0]), files_per_directory)
      sys.exit(1)
  





More information about the Spambayes-checkins mailing list