[Mailman-Developers] MHonArc integration

Anand Kumria wildfire@progsoc.uts.edu.au, mailman-developers@python.org
Fri, 4 May 2001 02:41:04 +1000


--m51xatjYGsM+13rf
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Hi folks,

I've just set up another Mailman site and once again I wanted
to integrate it with MHonArc. Previously I'd been using a shell
script that played games with symlinks and the archive
files behind Mailman's back.

I wanted something simpler to use, so after much judicious
cutting and pasting of other code I've put together MHonArc.py
which you can use as an almost compatible replacement for
HyperArch.py

There are a number of things left to do:
	- generate HTML index pages properly
	- store text versions of the archive along with each dir.
	- cleanup the code and document it

This is my first real experience with Python, so no laughing (too
loud). I think I've also noticed a bug where Mailman uses the
underlying Python rfc822 object when Mailman's is more useful --
see comments in add_article.

I've also simplified the date handling (the current archiving
code, HyperArch/pipermail, converts to and from the Unix epoch
many times).

Doing this also made clear that a nicer interface in Archive.py
could be done which should handle, imo:
	- message date => volume (and thus path) conversions
	- make the archive date format local to each list
	e.g. I'd like to have 2001/05/01 instead of 20010501
	- generation of HTML index pages
	- make the HTML template editable via the admin interface
	- expose some way to allow list-admins to configure
	any options the archiver might allow (MHonArc allows a lot)

Feedback on all of this welcome.

Regards,
Anand

--m51xatjYGsM+13rf
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="MHonArc.py"

#
# Archive interface for MHonArc and Mailman
#

"""
This attempts to simulate the HyperArch Python script, but using
MHonArc instead. By doing this I hope to avoid the need to have
'list rotation' and 'index generation' as external jobs.


1. setup archive directory
	- set ARCHIVE_PERIOD to appropriate value
2. convert mbox into individual messages
	- figure out location to put message
		- convert message date to appropriate directory
		- create directory if needed
3. add message to archive
	- add message to text file in 
	- interpolate the appropriate directory into the -output and -resource options
	- execute MHonArc
4. create HTML index pages
"""

import sys
import os
import time
import string
from Mailman import mm_cfg
from Mailman import Utils
from Mailman.Logging.Syslog import syslog
from Mailman.Mailbox import Mailbox
from Mailman.Utils import mkdir, open_ex

# Module-wide settings.  DIRMODE is the mode passed to mkdir() by
# HyperArchive.make_archive(); FILEMODE and INDEX_EXT are not referenced
# by the code in this file.
DIRMODE = 0755      # Mode to give to created directories
FILEMODE = 0644     # Mode to give to created files
INDEX_EXT = ".html" # Extension for indexes
VERBOSE = 1         # Non-zero: HyperArchive.message() writes to stderr

# Page template for the list's top-level index.html.  HyperArchive.html_TOC()
# fills it by %-interpolation with a dictionary providing the keys:
# listname, listinfo, fullarch, size, noarchive_msg, archive_listing_start,
# archive_listing and archive_listing_end.
TOC_template='''\
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
  <HEAD>
     <title>The %(listname)s Archives</title>
     <META NAME="robots" CONTENT="noindex,follow">
  </HEAD>
  <BODY BGCOLOR="#ffffff">
     <h1>The %(listname)s Archives </h1>
     <p>
      You can get <a href="%(listinfo)s">more information about this list</a>
      or you can <a href="%(fullarch)s">download the full raw archive</a>
      (%(size)s).
     </p>
     %(noarchive_msg)s
     %(archive_listing_start)s
     %(archive_listing)s
     %(archive_listing_end)s
     </BODY>
     </HTML>
'''

# One table row of the index page, describing a single archive volume.
# HyperArchive.html_TOC_entry() fills it with the keys 'archive' (the volume
# name, also used as the link prefix) and 'textlink' (a ready-made <td> cell
# linking to the volume's text file, or an empty string).
TOC_entry_template = '''\

	    <tr>
            <td>%(archive)s:</td>
            <td>
              <A href="%(archive)s/thread.html">[ Thread ]</a>
              <A href="%(archive)s/subject.html">[ Subject ]</a>
              <A href="%(archive)s/author.html">[ Author ]</a>
              <A href="%(archive)s/date.html">[ Date ]</a>
            </td>
            %(textlink)s
            </tr>

'''

def sizeof(filename):
    """Return the size of `filename` as a short, space-padded string.

    The size is reported in decimal units: ' N bytes ' below 1000 bytes,
    ' N KB ' below 1000000 bytes and ' N MB ' from there on (the fraction
    is dropped).  Whatever os.path.getsize() raises for a missing or
    unreadable file is propagated to the caller.
    """
    nbytes = os.path.getsize(filename)
    # Test the largest unit first so that each case is a simple guard.
    if nbytes >= 1000000:
        return ' %d MB ' % (nbytes / 1000000)
    if nbytes >= 1000:
        return ' %d KB ' % (nbytes / 1000)
    return ' %d bytes ' % nbytes


class HyperArchive:
    """Archiver that hands every message to an external MHonArc command.

    It offers the entry points used in this file (processUnixMailbox(),
    close(), ...) so that it can serve as an almost compatible replacement
    for the class of the same name in HyperArch.py.

    Each message is filed into a "volume" directory below the list's
    archive directory; the volume name is derived from the message date
    and the list's archive_volume_frequency (see dateToVolName()).
    """

    # Volume period used when the list has no archive_volume_frequency
    # attribute or when its value is not one of the keys of _PERIODS.
    ARCHIVE_PERIOD = 'month'

    # archive_volume_frequency value -> volume period.  Every other value
    # (1 included) selects monthly volumes.
    _PERIODS = {0: 'year', 2: 'quarter', 3: 'week', 4: 'day'}

    # Minimal table wrappers put around the <tr> rows produced by
    # html_TOC_entry() when html_TOC() has volumes to list.
    arch_listing_start = '<table border=3>\n'
    arch_listing_end = '</table>\n'

    def __init__(self, maillist):
        """Remember `maillist` and pick the volume period and templates.

        `maillist` is the mailing-list object; this class uses its
        archive_dir(), internal_name(), GetScriptURL(), real_name,
        archive_directory and (optionally) archive_volume_frequency.
        """
        # make maillist available inside this object
        self.maillist = maillist
        # setup archive attributes
        frequency = getattr(maillist, 'archive_volume_frequency', None)
        self.ARCHIVE_PERIOD = self._PERIODS.get(frequency, 'month')

        self.html_TOC_tmpl = TOC_template
        self.TOC_entry_tmpl = TOC_entry_template

    def processUnixMailbox(self, input):
        """Archive every message of the mbox-format file object `input`.

        For each message the date is determined, the matching volume
        directory is created if necessary and the message is passed to
        the external archiver.
        """
        mbox = Mailbox(input)
        while 1:
            m = mbox.next()
            # Compare with None explicitly: a message object without any
            # headers can be false in a boolean test, which would end the
            # loop before the mailbox is exhausted.
            if m is None:
                break
            self.message("constructed 0 ")
            self.set_msg_date_tuple(m)
            self.message(self.msg_date_tuple)

            self.make_archive(self.get_list_archive_path())

            self.add_article(m)
            self.message("constructed 3 ")

    def make_archive(self, path):
        """Create the directory `path` (recursively) unless it exists.

        Any os.error from os.stat() other than "no such file or
        directory" is re-raised unchanged.
        """
        import errno
        try:
            os.stat(path)
        except os.error:
            # sys.exc_info() rather than "except os.error, e" keeps this
            # valid for both old and new exception syntax.
            err = sys.exc_info()[1]
            if err.errno != errno.ENOENT:
                raise
            mkdir(path, DIRMODE)
            self.message("created: %s" % path)

    def add_article(self, msg):
        """Feed the single message `msg` to the MHonArc command.

        The command template is taken from mm_cfg.MHONARC.
        """
        # Using a mailbox gives us (eventually) Python's rfc822 Message
        # object instead of Mailman's.  Here we simply reach in and do
        # the necessary, but the real fix is to ensure Mailman ships its
        # own version of mailbox.py instead of relying on Python's.
        #
        # Convert the rfc822 Message object back into text: envelope
        # line, raw header lines, blank separator, then the body that is
        # still unread on the underlying file object.
        txt = (msg.unixfrom + string.join(msg.headers, '') + '\n' +
               msg.fp.read())
        self.ExternalArchive(mm_cfg.MHONARC, txt)
        return

    def message(self, msg):
        """Write `msg` plus a newline (if missing) to stderr when VERBOSE.

        `msg` may be any object; non-strings (e.g. a date tuple) are
        converted to their string form instead of making write() fail.
        """
        if not VERBOSE:
            return
        text = '%s' % (msg,)
        out = sys.stderr
        out.write(text)
        if text[-1:] != '\n':
            out.write('\n')
        out.flush()

    def ExternalArchive(self, ar, txt):
        """Pipe the message text `txt` into the archiver command `ar`.

        `ar` is a command template in which %(listname)s and
        %(archivedir)s are replaced by the list's internal name and the
        volume directory of the current message.  A non-zero exit status
        of the command is reported through syslog.
        """
        # One interpolation pass with both keys, so that a literal "%%"
        # in the template is unescaped exactly once.
        substitutions = Utils.SafeDict({
            'listname': self.maillist.internal_name(),
            'archivedir': self.get_list_archive_path(),
            })
        cmd = ar % substitutions
        self.message("cmd = %s" % cmd)
        extarch = os.popen(cmd, 'w')
        try:
            extarch.write(txt)
        finally:
            # Always reap the child, even when the write failed.
            status = extarch.close()
        if status:
            # The exit code lives in the high byte of the wait status.
            # The extra parentheses matter: % binds tighter than >>.
            syslog('error', 'external archiver non-zero exit status: %d\n' %
                   ((status & 0xff00) >> 8))

    def dateToVolName(self, date):
        """Return the volume (directory) name for the time tuple `date`.

        `date` is a 9-field local time tuple as returned by
        time.localtime().  Depending on self.ARCHIVE_PERIOD the result
        looks like '2001', '2001q2', '20010504',
        'Week-of-Mon-20010430' or (the default, monthly) '2001-May'.
        """
        datetuple = date
        period = self.ARCHIVE_PERIOD
        if period == 'year':
            return time.strftime("%Y", datetuple)
        elif period == 'quarter':
            month = datetuple[1]
            if month in (1, 2, 3):
                return time.strftime("%Yq1", datetuple)
            elif month in (4, 5, 6):
                return time.strftime("%Yq2", datetuple)
            elif month in (7, 8, 9):
                return time.strftime("%Yq3", datetuple)
            else:
                return time.strftime("%Yq4", datetuple)
        elif period == 'day':
            return time.strftime("%Y%m%d", datetuple)
        elif period == 'week':
            # Step back to the Monday of the message's week.  Working on
            # the calendar fields at midday, and letting mktime()
            # normalise an out-of-range day number, stays on the right
            # day even when a DST change falls inside the week;
            # subtracting weekday * 86400 seconds from the message time
            # does not.
            monday = time.mktime((datetuple[0], datetuple[1],
                                  datetuple[2] - datetuple[6],
                                  12, 0, 0, 0, 0, -1))
            return time.strftime("Week-of-Mon-%Y%m%d",
                                 time.localtime(monday))
        else:
            # month
            return time.strftime("%Y-%B", datetuple)

    def set_msg_date_tuple(self, message):
        """Store the date of `message` in self.msg_date_tuple.

        The Date header is parsed and passed through mktime()/localtime()
        in order to validate it.  The time-zone offset of the header is
        ignored; the date fields are interpreted as local time.  When the
        header is missing, cannot be parsed or is out of range, the
        current time is used instead.
        """
        # getdate_tz() yields None both for a missing header and for one
        # it cannot parse.
        parsed = message.getdate_tz('Date')
        date = None
        if parsed is not None:
            try:
                date = time.localtime(time.mktime(parsed[:9]))
            except (ValueError, OverflowError):
                # Not representable: fall back to the current time below.
                date = None
        if date is None:
            date = time.localtime(time.time())

        self.msg_date_tuple = date

    def get_list_archive_path(self):
        """Return the volume directory for the current message.

        Requires a previous call of set_msg_date_tuple().
        """
        basedir = self.maillist.archive_dir()
        arch = self.dateToVolName(self.msg_date_tuple)
        return os.path.join(basedir, arch)

    def write_TOC(self):
        """(Re)write index.html in the list's archive directory."""
        basedir = self.maillist.archive_dir()
        toc = open_ex(os.path.join(basedir, 'index.html'), 'w')
        try:
            toc.write(self.html_TOC())
        finally:
            toc.close()

    def html_TOC(self):
        """Return the HTML text of the list's table-of-contents page."""
        # for now, fudge the fact we have no archives
        self.archives = None
        listname = self.maillist.internal_name()
        mbox = os.path.join(self.maillist.archive_directory + '.mbox',
                            listname + '.mbox')
        d = {"listname": self.maillist.real_name,
             "listinfo": self.maillist.GetScriptURL('listinfo', absolute=1),
             "fullarch": '../%s.mbox/%s.mbox' % (listname, listname),
             "size": sizeof(mbox),
             "encoding": "",
             }
        if not self.archives:
            d["noarchive_msg"] = '<P>Currently, there are no archives. </P>'
            d["archive_listing_start"] = ""
            d["archive_listing_end"] = ""
            d["archive_listing"] = ""
        else:
            d["noarchive_msg"] = ""
            d["archive_listing_start"] = self.arch_listing_start
            d["archive_listing_end"] = self.arch_listing_end
            accum = []
            for a in self.archives:
                accum.append(self.html_TOC_entry(a))
            d["archive_listing"] = string.join(accum, '')
        return self.html_TOC_tmpl % d

    def html_TOC_entry(self, arch):
        """Return the HTML table row for the volume named `arch`.

        The row links to the volume's text file when one exists below
        mm_cfg.PRIVATE_ARCHIVE_FILE_DIR; the gzip'd file is preferred
        over the plain one.
        """
        txtfile = os.path.join(mm_cfg.PRIVATE_ARCHIVE_FILE_DIR,
                               self.maillist.internal_name(),
                               arch + '.txt')
        gzfile = txtfile + '.gz'
        templ = '<td><A href="%(url)s">[ %(fmt)sText%(sz)s]</a></td>'
        # which exists?  .txt.gz first, then .txt
        if os.path.exists(gzfile):
            textlink = templ % {'url': arch + '.txt.gz',
                                'fmt': "Gzip'd ",
                                'sz': sizeof(gzfile),
                                }
        elif os.path.exists(txtfile):
            textlink = templ % {'url': arch + '.txt',
                                'fmt': '',
                                'sz': sizeof(txtfile),
                                }
        else:
            # there's no archive file at all... hmmm.
            textlink = ''
        # Return the row in every case, not only when no text file exists.
        return self.TOC_entry_tmpl % {'archive': arch,
                                      'textlink': textlink}

    def close(self):
        """Finish archiving: regenerate the table-of-contents page."""
        self.write_TOC()
        return

--m51xatjYGsM+13rf--