[Spambayes-checkins] spambayes pop3proxy.py,1.16,1.17

Richie Hindle richiehindle@users.sourceforge.net
Wed Nov 20 12:45:24 2002


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv21143

Modified Files:
	pop3proxy.py 
Log Message:
 o Multiple server support - the old ini-file settings are deprecated;
   see Options.py
 o Added a 'defer' choice in addition to discard/ham/spam - thanks to
   Skip for the suggestion.
 o The training page now groups by X-Hammie-Disposition - thanks again
   to Skip.
 o Added a Save Database button to the status panel.
 o Added nspam and nham to the status panel.
 o Fixed several Mac-related problems reported by François: timestamps
   are now converted to longs before being formatted or parsed.


Index: pop3proxy.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/pop3proxy.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** pop3proxy.py	18 Nov 2002 19:14:48 -0000	1.16
--- pop3proxy.py	20 Nov 2002 12:45:21 -0000	1.17
***************
*** 53,58 ****
  Web training interface:
  
-  o Include more stats in the Status box - it's easy to lose track of
-    where you are when testing.
   o Functional tests.
   o Review already-trained messages, and purge them.
--- 53,56 ----
***************
*** 80,85 ****
   o Possibly integrate Tim Stone's SMTP code - make it use async, make
     the training code update (rather than replace!) the database.
-  o Option to keep trained messages and view potential FPs and FNs to
-    correct them.
   o Allow use of the UI without the POP3 proxy.
   o Remove any existing X-Hammie-Disposition header from incoming emails.
--- 78,81 ----
***************
*** 107,115 ****
   o Classify a web page given a URL.
   o Graphs.  Of something.  Who cares what?
   o Zoe...!
  
  """
  
! import os, sys, re, operator, errno, getopt, cPickle, cStringIO, time, bisect
  import socket, asyncore, asynchat, cgi, urlparse, webbrowser
  import Bayes, tokenizer, mboxutils
--- 103,112 ----
   o Classify a web page given a URL.
   o Graphs.  Of something.  Who cares what?
+  o NNTP proxy.
   o Zoe...!
  
  """
  
! import os, sys, re, operator, errno, getopt, string, cStringIO, time, bisect
  import socket, asyncore, asynchat, cgi, urlparse, webbrowser
  import Bayes, tokenizer, mboxutils
***************
*** 477,481 ****
                  # The message name is the time it arrived, with a uniquifier
                  # appended if two arrive within one clock tick of each other.
!                 messageName = "%10.10d" % time.time()
                  if messageName == state.lastBaseMessageName:
                      state.lastBaseMessageName = messageName
--- 474,478 ----
                  # The message name is the time it arrived, with a uniquifier
                  # appended if two arrive within one clock tick of each other.
!                 messageName = "%10.10d" % long(time.time())
                  if messageName == state.lastBaseMessageName:
                      state.lastBaseMessageName = messageName
***************
*** 603,612 ****
                    &nbsp;<br>\n"""
  
!     summary = """POP3 proxy running on port <b>%(proxyPort)d</b>,
!               proxying to <b>%(serverName)s:%(serverPort)d</b>.<br>
                Active POP3 conversations: <b>%(activeSessions)d</b>.<br>
                POP3 conversations this session: <b>%(totalSessions)d</b>.<br>
                Emails classified this session: <b>%(numSpams)d</b> spam,
!                 <b>%(numHams)d</b> ham, <b>%(numUnsure)d</b> unsure.
                """
  
--- 600,614 ----
                    &nbsp;<br>\n"""
  
!     summary = """POP3 proxy running on <b>%(proxyPortsString)s</b>,
!               proxying to <b>%(serversString)s</b>.<br>
                Active POP3 conversations: <b>%(activeSessions)d</b>.<br>
                POP3 conversations this session: <b>%(totalSessions)d</b>.<br>
                Emails classified this session: <b>%(numSpams)d</b> spam,
!                 <b>%(numHams)d</b> ham, <b>%(numUnsure)d</b> unsure.<br>
!               Total emails trained: Spam: <b>%(nspam)d</b>
!                                      Ham: <b>%(nham)d</b><br>
!               <form action='save' method='POST'>
!               <input type='submit' value='Save database'>
!               </form>
                """
  
***************
*** 620,628 ****
               using the <a href='review'>Review messages</a> page."""
  
!     reviewHeader = """<p>These are unclassified emails, which you can use to
!                    train the classifier.  Check the Discard / Ham / Spam
!                    buttton for each email, then click 'Train' below.  (To
!                    discard the whole page, leave everything with Discard
!                    checked and click 'Train'.)</p>
                     <form action='review' method='GET'>
                         <input type='hidden' name='prior' value='%d'>
--- 622,630 ----
               using the <a href='review'>Review messages</a> page."""
  
!     reviewHeader = """<p>These are untrained emails, which you can use to
!                    train the classifier.  Check the Discard / Defer / Ham /
!                    Spam buttton for each email, then click 'Train' below.
!                    (Defer leaves the message here, to be trained on
!                    later.)</p>
                     <form action='review' method='GET'>
                         <input type='hidden' name='prior' value='%d'>
***************
*** 639,644 ****
                     <form action='review' method='POST'>
                     <table class='messagetable' cellpadding='0' cellspacing='0'>
!                    <tr><td><b>Subject:</b></td><td><b>From:</b></td>
!                    <td><b>Discard / Ham / Spam</b></td></tr>"""
  
      upload = """<form action='%s' method='POST'
--- 641,649 ----
                     <form action='review' method='POST'>
                     <table class='messagetable' cellpadding='0' cellspacing='0'>
!                    """
! 
!     reviewSubheader = """<tr><td><b>Messages classified as %s:</b></td>
!                           <td><b>From:</b></td>
!                           <td><b>Discard / Defer / Ham / Spam</b></td></tr>"""
  
      upload = """<form action='%s' method='POST'
***************
*** 769,773 ****
              homeLink = "<a href='home'>Home</a> > %s" % name
          if showImage:
!             image = "<img src='/helmet.gif' align='absmiddle'>&nbsp;"
          else:
              image = ""
--- 774,778 ----
              homeLink = "<a href='home'>Home</a> > %s" % name
          if showImage:
!             image = "<img src='helmet.gif' align='absmiddle'>&nbsp;"
          else:
              image = ""
***************
*** 796,800 ****
      def onHome(self, params):
          """Serve up the homepage."""
!         body = (self.pageSection % ('Status', self.summary % state.__dict__)+
                  self.pageSection % ('Train on proxied messages', self.review)+
                  self.pageSection % ('Train on a given message', self.train)+
--- 801,807 ----
      def onHome(self, params):
          """Serve up the homepage."""
!         stateDict = state.__dict__
!         stateDict.update(state.bayes.__dict__)
!         body = (self.pageSection % ('Status', self.summary % stateDict)+
                  self.pageSection % ('Train on proxied messages', self.review)+
                  self.pageSection % ('Train on a given message', self.train)+
***************
*** 803,813 ****
          self.push(body)
  
      def onShutdown(self, params):
          """Shutdown the server, saving the pickle if requested to do so."""
          if params['how'].lower().find('save') >= 0:
!             if not state.useDB and state.databaseFilename:
!                 self.push("<b>Saving...</b>")
!                 self.push(' ')  # Acts as a flush for small buffers.
!                 state.bayes.store()
          self.push("<b>Shutdown</b>. Goodbye.</div></body></html>")
          self.push(' ')
--- 810,828 ----
          self.push(body)
  
+     def doSave(self):
+         """Saves the database.  Worker for onSave and onShutdown."""
+         self.push("<b>Saving... ")
+         self.push(' ')
+         state.bayes.store()
+         self.push("Done</b>.")
+ 
+     def onSave(self, params):
+         """Command handler for "Save"."""
+         self.doSave()
+ 
      def onShutdown(self, params):
          """Shutdown the server, saving the pickle if requested to do so."""
          if params['how'].lower().find('save') >= 0:
!             self.doSave()
          self.push("<b>Shutdown</b>. Goodbye.</div></body></html>")
          self.push(' ')
***************
*** 845,849 ****
          for that message.  This is the time that the message was received,
          not the Date header."""
!         return int(key[:10])
  
      def getTimeRange(self, timestamp):
--- 860,864 ----
          for that message.  This is the time that the message was received,
          not the Date header."""
!         return long(key[:10])
  
      def getTimeRange(self, timestamp):
***************
*** 879,884 ****
  
          # Find the subset of the keys within this range.
!         startKeyIndex = bisect.bisect(allKeys, "%d" % start)
!         endKeyIndex = bisect.bisect(allKeys, "%d" % end)
          keys = allKeys[startKeyIndex:endKeyIndex]
          keys.reverse()
--- 894,899 ----
  
          # Find the subset of the keys within this range.
!         startKeyIndex = bisect.bisect(allKeys, "%d" % long(start))
!         endKeyIndex = bisect.bisect(allKeys, "%d" % long(end))
          keys = allKeys[startKeyIndex:endKeyIndex]
          keys.reverse()
***************
*** 896,911 ****
          return keys, date, prior, start, end
  
!     def onReview(self, params):
!         """Present a list of message for (re)training."""
  
!         # This is the radio group for training/discarding.
!         trainRadio = """<input type='radio' name='classify:%s'
!                                value='discard' checked>
!                         <input type='radio' name='classify:%s' value='ham'>
!                         <input type='radio' name='classify:%s' value='spam'>"""
  
          # Train/discard sumbitted messages.
          id = ''
          numTrained = 0
          for key, value in params.items():
              if key.startswith('classify:'):
--- 911,947 ----
          return keys, date, prior, start, end
  
!     def appendMessages(self, lines, keyedMessages, judgement):
!         """Appends the lines of a table of messages to 'lines'."""
!         buttons = """<input type='radio' name='classify:%s' value='discard'>
!                   <input type='radio' name='classify:%s' value='defer' %s>
!                   <input type='radio' name='classify:%s' value='ham' %s>
!                   <input type='radio' name='classify:%s' value='spam' %s>"""
!         stripe = 0
!         for key, message in keyedMessages:
!             # Parse the message and get the relevant headers.
!             subject = self.trimAndQuote(message["Subject"] or "(none)", 50)
!             from_ = self.trimAndQuote(message["From"] or "(none)", 40)
  
!             # Output the table row for this message.
!             defer = ham = spam = ""
!             if judgement == options.header_spam_string:
!                 spam='checked'
!             elif judgement == options.header_ham_string:
!                 ham='checked'
!             elif judgement == options.header_unsure_string:
!                 defer='checked'
!             radioGroup = buttons % (key, key, defer, key, ham, key, spam)
!             stripeClass = ['stripe_on', 'stripe_off'][stripe]
!             lines.append("""<tr class='%s'><td>%s</td><td>%s</td>
!                             <td align='middle'>%s</td></tr>""" % \
!                             (stripeClass, subject, from_, radioGroup))
!             stripe = stripe ^ 1
  
+     def onReview(self, params):
+         """Present a list of message for (re)training."""
          # Train/discard sumbitted messages.
          id = ''
          numTrained = 0
+         numDeferred = 0
          for key, value in params.items():
              if key.startswith('classify:'):
***************
*** 915,921 ****
                  elif value == 'ham':
                      targetCorpus = state.hamCorpus
!                 else: # Discard
                      targetCorpus = None
!                     state.unknownCorpus.removeMessage(state.unknownCorpus[id])
                  if targetCorpus:
                      try:
--- 951,963 ----
                  elif value == 'ham':
                      targetCorpus = state.hamCorpus
!                 elif value == 'discard':
                      targetCorpus = None
!                     try:
!                         state.unknownCorpus.removeMessage(state.unknownCorpus[id])
!                     except KeyError:
!                         pass  # Must be a reload.
!                 else: # defer
!                     targetCorpus = None
!                     numDeferred += 1
                  if targetCorpus:
                      try:
***************
*** 939,946 ****
              self.push("Done.</b></p>")
  
!         # After submitting a page, display the prior page or the next one.
!         # Derive the day of the submitted page from the ID of the last
!         # processed message.
!         if id:
              start = self.keyToTimestamp(id)
              _, _, prior, _, next = self.buildReviewKeys(start)
--- 981,992 ----
              self.push("Done.</b></p>")
  
!         # If any messages were deferred, show the same page again.
!         if numDeferred > 0:
!             start = self.keyToTimestamp(id)
! 
!         # Else after submitting a whole page, display the prior page or the
!         # next one.  Derive the day of the submitted page from the ID of the
!         # last processed message.
!         elif id:
              start = self.keyToTimestamp(id)
              _, _, prior, _, next = self.buildReviewKeys(start)
***************
*** 960,965 ****
              start = 0
  
!         # Present the list of messages in reverse order of appearance.
          keys, date, prior, this, next = self.buildReviewKeys(start)
          if keys:
              priorState = nextState = ""
--- 1006,1024 ----
              start = 0
  
!         # Build the lists of messages: spams, hams and unsure.
          keys, date, prior, this, next = self.buildReviewKeys(start)
+         keyedMessages = {options.header_spam_string: [],
+                          options.header_ham_string: [],
+                          options.header_unsure_string: []}
+         for key in keys:
+             # Parse the message and get the judgement header.
+             cachedMessage = state.unknownCorpus[key]
+             message = mboxutils.get_message(cachedMessage.getSubstance())
+             judgement = message[options.hammie_header_name] or \
+                                             options.header_unsure_string
+             keyedMessages[judgement].append((key, message))
+ 
+         # Present the list of messages in their groups in reverse order of
+         # appearance.
          if keys:
              priorState = nextState = ""
***************
*** 969,996 ****
                  nextState = 'disabled'
              lines = [self.reviewHeader % (prior, next, priorState, nextState)]
!             stripe = 0
!             for key in keys:
!                 # Parse the message and get the relevant headers.
!                 cachedMessage = state.unknownCorpus[key]
!                 message = mboxutils.get_message(cachedMessage.getSubstance())
!                 subject = self.trimAndQuote(message["Subject"] or "(none)", 50)
!                 from_ = self.trimAndQuote(message["From"] or "(none)", 40)
  
-                 # Output the table row for this message.
-                 key = cachedMessage.key()
-                 radioGroup = trainRadio % (key, key, key)
-                 stripeClass = ['stripe_on', 'stripe_off'][stripe]
-                 lines.append("""<tr class='%s'><td>%s</td><td>%s</td>
-                                 <td align='middle'>%s</td></tr>""" % \
-                                 (stripeClass, subject, from_, radioGroup))
-                 stripe = stripe ^ 1
              lines.append("""<tr><td></td><td></td><td align='middle'>&nbsp;<br>
                              <input type='submit' value='Train'></td></tr>""")
              lines.append("</table></form>")
              content = "\n".join(lines)
!             title = "Unclassified messages received on %s" % date
          else:
!             content = "<p>There are no unclassified messages to display.</p>"
!             title = "No unclassified messages"
  
          self.push(self.pageSection % (title, content))
--- 1028,1047 ----
                  nextState = 'disabled'
              lines = [self.reviewHeader % (prior, next, priorState, nextState)]
!             for header, type in ((options.header_spam_string, 'Spam'),
!                                  (options.header_ham_string, 'Ham'),
!                                  (options.header_unsure_string, 'Unsure')):
!                 if keyedMessages[header]:
!                     lines.append("<tr><td>&nbsp;</td><td></td><td></td></tr>")
!                     lines.append(self.reviewSubheader % type)
!                     self.appendMessages(lines, keyedMessages[header], header)
  
              lines.append("""<tr><td></td><td></td><td align='middle'>&nbsp;<br>
                              <input type='submit' value='Train'></td></tr>""")
              lines.append("</table></form>")
              content = "\n".join(lines)
!             title = "Untrained messages received on %s" % date
          else:
!             content = "<p>There are no untrained messages to display.</p>"
!             title = "No untrained messages"
  
          self.push(self.pageSection % (title, content))
***************
*** 1047,1054 ****
          self.logFile = open('_pop3proxy.log', 'wb', 0)
  
!         # Load up the default settings from Option.py / bayescustomize.ini
!         self.proxyPort = options.pop3proxy_port
!         self.serverName = options.pop3proxy_server_name
!         self.serverPort = options.pop3proxy_server_port
          self.databaseFilename = options.persistent_storage_file
          self.useDB = options.persistent_use_database
--- 1098,1134 ----
          self.logFile = open('_pop3proxy.log', 'wb', 0)
  
!         # Load up the old proxy settings from Options.py / bayescustomize.ini
!         # and give warnings if they're present.   XXX Remove these soon.
!         if options.pop3proxy_port != 110 or \
!            options.pop3proxy_server_name != '' or \
!            options.pop3proxy_server_port != 110:
!             print "\n    pop3proxy_port, pop3proxy_server_name and"
!             print "    pop3proxy_server_port are deprecated!  Please use"
!             print "    pop3proxy_servers and pop3proxy_ports instead.\n"
!         self.servers = [(options.pop3proxy_server_name,
!                          options.pop3proxy_server_port)]
!         self.proxyPorts = [options.pop3proxy_port]
! 
!         # Load the new proxy settings - these will override the old ones
!         # if both are present.
!         if options.pop3proxy_servers:
!             self.servers = []
!             for server in options.pop3proxy_servers.split(','):
!                 server = server.strip()
!                 if server.find(':') > -1:
!                     server, port = server.split(':', 1)
!                 else:
!                     port = '110'
!                 self.servers.append((server, int(port)))
! 
!         if options.pop3proxy_ports:
!             splitPorts = options.pop3proxy_ports.split(',')
!             self.proxyPorts = map(int, map(string.strip, splitPorts))
! 
!         if len(self.servers) != len(self.proxyPorts):
!             print "pop3proxy_servers & pop3proxy_ports are different lengths!"
!             sys.exit()
! 
!         # Load up the other settings from Option.py / bayescustomize.ini
          self.databaseFilename = options.persistent_storage_file
          self.useDB = options.persistent_use_database
***************
*** 1074,1077 ****
--- 1154,1164 ----
          self.uniquifier = 2
  
+     def buildServerStrings(self):
+         """After the server details have been set up, this creates string
+         versions of the details, for display in the Status panel."""
+         serverStrings = ["%s:%s" % (s, p) for s, p in self.servers]
+         self.serversString = ', '.join(serverStrings)
+         self.proxyPortsString = ', '.join(map(str, self.proxyPorts))
+ 
      def createWorkers(self):
          """Using the options that were initialised in __init__ and then
***************
*** 1117,1125 ****
  
  
! def main(serverName, serverPort, proxyPort,
!          uiPort, launchUI, databaseFilename, useDB):
      """Runs the proxy forever or until a 'KILL' command is received or
      someone hits Ctrl+Break."""
!     BayesProxyListener(serverName, serverPort, proxyPort)
      UserInterfaceListener(uiPort)
      if launchUI:
--- 1204,1212 ----
  
  
! def main(servers, proxyPorts, uiPort, launchUI):
      """Runs the proxy forever or until a 'KILL' command is received or
      someone hits Ctrl+Break."""
!     for (server, serverPort), proxyPort in zip(servers, proxyPorts):
!         BayesProxyListener(server, serverPort, proxyPort)
      UserInterfaceListener(uiPort)
      if launchUI:
***************
*** 1382,1386 ****
              state.databaseFilename = arg
          elif opt == '-l':
!             state.proxyPort = int(arg)
          elif opt == '-u':
              state.uiPort = int(arg)
--- 1469,1473 ----
              state.databaseFilename = arg
          elif opt == '-l':
!             state.proxyPorts = [int(arg)]
          elif opt == '-u':
              state.uiPort = int(arg)
***************
*** 1393,1396 ****
--- 1480,1484 ----
      if runSelfTest:
          print "\nRunning self-test...\n"
+         state.buildServerStrings()
          test()
          print "Self-test passed."   # ...else it would have asserted.
***************
*** 1403,1420 ****
      elif 0 <= len(args) <= 2:
          # Normal usage, with optional server name and port number.
!         if len(args) >= 1:
!             state.serverName = args[0]
!         if len(args) >= 2:
!             state.serverPort = int(args[1])
  
!         if not state.serverName:
              print >>sys.stderr, \
                    ("Error: You must give a POP3 server name, either in\n"
!                    "bayescustomize.ini as pop3proxy_server_name or on the\n"
                     "command line.  pop3server.py -h prints a usage message.")
          else:
!             main(state.serverName, state.serverPort, state.proxyPort,
!                  state.uiPort, state.launchUI, state.databaseFilename,
!                  state.useDB)
  
      else:
--- 1491,1507 ----
      elif 0 <= len(args) <= 2:
          # Normal usage, with optional server name and port number.
!         if len(args) == 1:
!             state.servers = [(args[0], 110)]
!         elif len(args) == 2:
!             state.servers = [(args[0], int(args[1]))]
  
!         if not state.servers or not state.servers[0][0]:
              print >>sys.stderr, \
                    ("Error: You must give a POP3 server name, either in\n"
!                    "bayescustomize.ini as pop3proxy_servers or on the\n"
                     "command line.  pop3server.py -h prints a usage message.")
          else:
!             state.buildServerStrings()
!             main(state.servers, state.proxyPorts, state.uiPort, state.launchUI)
  
      else:





More information about the Spambayes-checkins mailing list