[Spambayes-checkins] spambayes/testtools incremental.HOWTO.txt, 1.4, 1.5 sort+group.py, 1.1, 1.2

Tim Peters tim_one at users.sourceforge.net
Sat Dec 27 20:12:14 EST 2003


Update of /cvsroot/spambayes/spambayes/testtools
In directory sc8-pr-cvs1:/tmp/cvs-serv24507

Modified Files:
	incremental.HOWTO.txt sort+group.py 
Log Message:
Sort msgs by full-precision timestamp (not just by day).  Normalize
Received time to UTC first (e.g., I have ISPs in different time zones,
and the order in which I receive msgs across all of them doesn't have
much to do with their ideas of local time).  Use email.Utils to parse
dates instead of hand-rolling our own parser (parsedate_tz knows a lot
about common real-world violations of RFC 2822, and can recover from them);
incidentally, then, the dependence on strptime is gone.  Remove line
ends from the first Received header before trying to parse it, as the
date-time field may be split across physical lines.  Preserve files'
extensions (if any) during renaming.


Index: incremental.HOWTO.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/incremental.HOWTO.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** incremental.HOWTO.txt	5 Nov 2003 13:01:44 -0000	1.4
--- incremental.HOWTO.txt	28 Dec 2003 01:12:11 -0000	1.5
***************
*** 23,33 ****
     sort+group.py for this.  sort+group.py sorts the messages
     into chronological order (by topmost Received header) and
!    then groups them by 24-hour period.
  
!    Note that you need to have either Python 2.3b1 or a *nix
!    version of Python to run the sort+group.py script.
!    Note also that this script will run through *all* the
!    files in the Data directory, not just those in Data/Ham
!    and Data/Spam.
  
  3. Distribute the corpora into multiple sets so you can do
--- 23,35 ----
     sort+group.py for this.  sort+group.py sorts the messages
     into chronological order (by topmost Received header) and
!    then groups them by 24-hour period.  The group number (0123)
!    is the number of full 24-hour periods that elapsed between
!    the time this msg was received and the time the oldest msg
!    found was received.  The id number (004556) is a unique
!    0-based ordinal across all msgs seen, with 000000 given to
!    the oldest msg found.
  
!    Note that this script will run through *all* the files in
!    the Data directory, not just those in Data/Ham and Data/Spam.
  
  3. Distribute the corpora into multiple sets so you can do
***************
*** 67,69 ****
  
  Please, somebody rewrite this file.
!  
--- 69,71 ----
  
  Please, somebody rewrite this file.
! 

Index: sort+group.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/testtools/sort+group.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** sort+group.py	28 Feb 2003 00:02:45 -0000	1.1
--- sort+group.py	28 Dec 2003 01:12:11 -0000	1.2
***************
*** 5,9 ****
  ### testing of chronological incremental training.
  
! """Usage: %(program)s
  
  This program has no options!  Muahahahaha!
--- 5,9 ----
  ### testing of chronological incremental training.
  
! """Usage: sort+group.py
  
  This program has no options!  Muahahahaha!
***************
*** 12,97 ****
  import sys
  import os
- import getopt
  import glob
- import re
  import time
- import filecmp
  
! program = sys.argv[0]
! loud = True
! day = 24 * 60 * 60
! dates = {}
  
! def usage(code, msg=''):
!     """Print usage message and sys.exit(code)."""
!     if msg:
!         print >> sys.stderr, msg
!         print >> sys.stderr
!     print >> sys.stderr, __doc__ % globals()
!     sys.exit(code)
  
! def bydate(name1, name2):
!     return cmp(dates[name1], dates[name2])
  
  def main():
      """Main program; parse options and go."""
  
!     global dates
!     dates = {}
!     names = []
!     date_re = re.compile(
!         r";[^0]* (\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{2,4})")
!     now = time.mktime(time.strptime(time.strftime("%d %b %Y"), "%d %b %Y"))
!     if loud: print "Scanning everything"
      for name in glob.glob('Data/*/*/*'):
          if loud:
              sys.stdout.write("%-78s\r" % name)
              sys.stdout.flush()
!         fh = file(name, "rb")
!         received = ""
!         line = fh.readline()
!         while line != "\r\n" and line != "\n" and line != "":
!             if line.lower().startswith("received:"):
!                 received = line
!                 line = fh.readline()
!                 while line != "" and (line[0] == " " or line[0] == "\t"):
!                     received += line
!                     line = fh.readline()
!                 break
!             line = fh.readline()
!         fh.close()
!         # Figure out how old the message is
!         date = now
!         try:
!             log = str(received)
!             received = date_re.search(received).group(1)
!             log = "\n" + str(received)
!             date = time.mktime(time.strptime(received, "%d %b %Y"))
!         except:
!             print "Couldn't parse " + name + ":"
!             print log
!             pass
!         dates[name] = date
!         names.append(name)
!     if loud: print ""
  
!     if loud: print "Sorting"
!     names.sort(bydate)
  
!     if loud: print "Renaming first pass"
!     for name in names:
!         dir = os.path.dirname(name)
!         base = os.path.basename(name)
!         os.rename(name, os.path.join(dir, "-"+base))
  
!     if loud: print "Renaming second pass"
!     first = dates[names[0]]
!     for num in range(0, len(names)):
!         name = names[num]
!         dir = os.path.dirname(name)
!         base = os.path.basename(name)
!         group = int((dates[name] - first) // day)
!         os.rename(os.path.join(dir, "-"+base),
!                   os.path.join(dir, "%04d-%06d" % (group, num)))
  
  if __name__ == "__main__":
--- 12,104 ----
  import sys
  import os
  import glob
  import time
  
! from email.Utils import parsedate_tz, mktime_tz
  
! loud = True
! SECONDS_PER_DAY = 24 * 60 * 60
  
! # Scan the file with path fpath for its first Received header, and return
! # a UTC timestamp for the date-time it specifies.  If anything goes wrong
! # (can't find a Received header; can't parse the date), return None.
! # This is the best guess about when we received the msg.
! def get_time(fpath):
!     fh = file(fpath, 'rb')
!     lines = iter(fh)
!     # Find first Received header.
!     for line in lines:
!         if line.lower().startswith("received:"):
!             break
!     else:
!         print "\nNo Received header found."
!         fh.close()
!         return None
!     # Paste on continuation lines, if any.
!     received = line
!     for line in lines:
!         if line[0] in ' \t':
!             received += line
!         else:
!             break
!     fh.close()
!     # RFC 2822 says the date-time field must follow a semicolon at the end.
!     i = received.rfind(';')
!     if i < 0:
!         print "\n" + received
!         print "No semicolon found in Received header."
!         return None
!     # We only want the part after the semicolon.
!     datestring = received[i+1:]
!     # It may still be split across lines (like "Wed, \r\n\t22 Oct ...").
!     datestring = ' '.join(datestring.split())
!     as_tuple = parsedate_tz(datestring)
!     if as_tuple is None:
!         print "\n" + received
!         print "Couldn't parse the date: %r" % datestring
!         return None
!     return mktime_tz(as_tuple)
  
  def main():
      """Main program; parse options and go."""
  
!     from os.path import join, split
! 
!     data = []   # list of (time_received, dirname, basename) triples
!     if loud:
!         print "Scanning everything"
!     now = time.time()
      for name in glob.glob('Data/*/*/*'):
          if loud:
              sys.stdout.write("%-78s\r" % name)
              sys.stdout.flush()
!         when_received = get_time(name) or now
!         data.append((when_received,) + split(name))
  
!     if loud:
!         print ""
!         print "Sorting ..."
!     data.sort()
  
!     # First rename all the files to a form we can't produce in the end.
!     # This is to protect against name clashes in case the files are
!     # already named according to the scheme we use.
!     if loud:
!         print "Renaming first pass ..."
!     for dummy, dirname, basename in data:
!         os.rename(join(dirname, basename),
!                   join(dirname, "-" + basename))
  
!     if loud:
!         print "Renaming second pass ..."
!     earliest = data[0][0]  # timestamp of earliest msg received
!     i = 0
!     for when_received, dirname, basename in data:
!         extension = os.path.splitext(basename)[-1]
!         group = int((when_received - earliest) / SECONDS_PER_DAY)
!         newbasename = "%04d-%06d" % (group, i)
!         os.rename(join(dirname, "-" + basename),
!                   join(dirname, newbasename + extension))
!         i += 1
  
  if __name__ == "__main__":





More information about the Spambayes-checkins mailing list