Mailman 3 Mailman web-archiver - Mailman-Developers

26 Aug 1998

      Well, it's a bit rough, and there are a few more things I'd like to
do with it, but my free time may be scarce lately & I'm going on vacation
in a week, so I figure I'd better post this & let some folks play with it.
Anyway, this is the integrated web-archiver module for Mailman.
For the moment, it still needs bsddb (mebbe after I get back I can try
tackling replacing that)
To get this working, here is what ye need to do:
1) Make sure you have the bsddb module compiled :P 
2) Make sure you have somewhere in your PYTHONPATH
   the following python modules:
     a) The latest pipermail (0.0.5)
     b) Digicool's DocumentTemplate package
       <http://www.digicool.com/releases/bobo/DocumentTemplate-rn.html>
3) Make sure you get rid of the old pipermail 0.0.2 sitting in 
   the Mailman package.
4) Put HyperArch.py in the Mailman package directory.
5) Put the 'arch' script in the ${prefix}/cron directory     
6) Apply the included patch.
7) add an entry like this to your crontab:
# Periodically  update the webarchive.
0 3,9,17,23 * * * /usr/local/bin/python /usr/services/mailman/cron/arch
Notes:
You can run the arch script as often as you like, depending on how
up-to-date you want your webarchives to be.
You also can run the script from the command line to manually add
articles to the archive. (I just ran three years' worth of traffic from a
reasonably high-volume majordomo list I'm moving over thru this system,
and it went thru fine.)
You can set archives to use yearly, quarterly, or monthly archives
with this.
You will probably want to use the 'Set date to when re-sent' option
on your lists with this module.
All the HTML generated by this module is generated from a template.
This makes customizing the output trivial, just edit the template.
(Right now the templates are static strings. This should be changed soon.)
If you do not run the arch cronjob, then this module will do nothing,
and will not interfere with an external archiver.
-The Dragon De Monsyne
diff -c /usr/src/mailman-1.0b5/Mailman/Archiver.py Mailman/Archiver.py
*** /usr/src/mailman-1.0b5/Mailman/Archiver.py	Mon Jul 27 17:48:31 1998
--- Mailman/Archiver.py	Sun Aug 16 11:11:52 1998

*** 42,49 ****
self.archive_private = mm_cfg.DEFAULT_ARCHIVE_PRIVATE
## 	self.archive_update_frequency = 

## 		 mm_cfg.DEFAULT_ARCHIVE_UPDATE_FREQUENCY
! ## 	self.archive_volume_frequency = 

! ## 		mm_cfg.DEFAULT_ARCHIVE_VOLUME_FREQUENCY
## 	self.archive_retain_text_copy = 

## 		mm_cfg.DEFAULT_ARCHIVE_RETAIN_TEXT_COPY
--- 42,49 ----
self.archive_private = mm_cfg.DEFAULT_ARCHIVE_PRIVATE
## 	self.archive_update_frequency = 

## 		 mm_cfg.DEFAULT_ARCHIVE_UPDATE_FREQUENCY
!  	self.archive_volume_frequency = 

!  		mm_cfg.DEFAULT_ARCHIVE_VOLUME_FREQUENCY
## 	self.archive_retain_text_copy = 

## 		mm_cfg.DEFAULT_ARCHIVE_RETAIN_TEXT_COPY

*** 62,71 ****
def GetBaseArchiveURL(self):
if self.archive_private:
return os.path.join(mm_cfg.PRIVATE_ARCHIVE_URL,
!                                 self._internal_name + ".html")
else:
return os.path.join(mm_cfg.PUBLIC_ARCHIVE_URL,
!                                 self._internal_name + ".html")
  def GetConfigInfo(self):
return [
--- 62,71 ----
def GetBaseArchiveURL(self):
if self.archive_private:
return os.path.join(mm_cfg.PRIVATE_ARCHIVE_URL,
!                                 self._internal_name + mm_cfg.PRIVATE_ARCHIVE_URL_EXT)
else:
return os.path.join(mm_cfg.PUBLIC_ARCHIVE_URL,
!                                 self._internal_name + mm_cfg.PRIVATE_ARCHIVE_URL_EXT)
  def GetConfigInfo(self):
return [

*** 85,93 ****
## 	     "How often should new messages be incorporated?  "
## 	     "0 for no archival, 1 for daily, 2 for hourly"),
! ## 	    ('archive_volume_frequency', mm_cfg.Radio, ('Yearly', 'Monthly'),
! ## 	     0,
! ## 	     'How often should a new archive volume be started?'),
## 	    ('archive_retain_text_copy', mm_cfg.Toggle, ('No', 'Yes'),
## 	     0,
--- 85,93 ----
## 	     "How often should new messages be incorporated?  "
## 	     "0 for no archival, 1 for daily, 2 for hourly"),
!  	    ('archive_volume_frequency', mm_cfg.Radio,
!                ('Yearly', 'Monthly','Quarterly'), 0,
!  	     'How often should a new archive volume be started?'),
## 	    ('archive_retain_text_copy', mm_cfg.Toggle, ('No', 'Yes'),
## 	     0,
diff -c /usr/src/mailman-1.0b5/Mailman/Defaults.py Mailman/Defaults.py
*** /usr/src/mailman-1.0b5/Mailman/Defaults.py	Wed Aug 19 01:46:37 1998
--- Mailman/Defaults.py	Thu Aug 27 00:22:46 1998

*** 37,42 ****
--- 37,48 ----
PUBLIC_ARCHIVE_URL = 'http://www.OVERRIDE.WITH.YOUR.PUBLIC.ARCHIVE.URL/'
PRIVATE_ARCHIVE_URL = 'http://www.OVERRIDE.WITH.YOUR.PRIVATE.ARCHIVE.URL/'

DEFAULT_ARCHIVE_VOLUME_FREQUENCY = 1

PUBLIC_ARCHIVE_URL_EXT = ''
PRIVATE_ARCHIVE_URL_EXT = ''
DEFAULT_ARCHIVE_PRIVATE    = 0		# 0=public, 1=private
HOME_PAGE         = 'index.html'
MAILMAN_OWNER     = 'mailman-owner@%s' % DEFAULT_HOST_NAME

*** 216,219 ****
VERSION           = '1.0b5'
# Data file version number
! DATA_FILE_VERSION = 3
--- 222,225 ----
VERSION           = '1.0b5'
# Data file version number
! DATA_FILE_VERSION = 4
diff -c /usr/src/mailman-1.0b5/Mailman/htmlformat.py Mailman/htmlformat.py
*** /usr/src/mailman-1.0b5/Mailman/htmlformat.py	Mon Jul 27 17:48:31 1998
--- Mailman/htmlformat.py	Wed Aug 19 02:24:43 1998

*** 383,389 ****
InputObj.__init__(self, name, "TEXT", value, checked=0, size=size)
class TextArea:
!     def __init__(self, name, text='', rows=None, cols=None, wrap='soft'):
self.name = name
self.text = text
self.rows = rows
--- 383,389 ----
InputObj.__init__(self, name, "TEXT", value, checked=0, size=size)
class TextArea:
!     def __init__(self, name, text='', rows=None, cols=None, wrap='off'):
self.name = name
self.text = text
self.rows = rows
diff -c /usr/src/mailman-1.0b5/Mailman/versions.py Mailman/versions.py
*** /usr/src/mailman-1.0b5/Mailman/versions.py	Mon Jul 27 17:48:31 1998
--- Mailman/versions.py	Wed Aug 19 02:31:06 1998

*** 63,70 ****
PreferStored('automatically_remove', 'automatic_bounce_action')
#  - dropped vars:
for a in ['archive_retain_text_copy',
!               'archive_update_frequency',
!               'archive_volume_frequency']:
if hasattr(l, a): delattr(l, a)
def UpdateOldUsers(l):
--- 63,69 ----
PreferStored('automatically_remove', 'automatic_bounce_action')
#  - dropped vars:
for a in ['archive_retain_text_copy',
!               'archive_update_frequency']:
if hasattr(l, a): delattr(l, a)
def UpdateOldUsers(l):
#!/usr/local/bin/python
import sys, string, getopt, os
import paths
import Mailman.HyperArch
import Mailman.Utils
from Mailman.HyperArch import HyperArchive
from Mailman.MailList import MailList
def ArchList(list):
    """Archive the pending messages of the mailing list named `list`.

    Opens the list with MailList's default arguments, runs
    HyperArchive.processListArch() on it and then closes the archive.
    """
    archive = HyperArchive(MailList(list))
    archive.processListArch()
    archive.close()
def PrintUsage():
    """Print the command line synopsis and exit with status 1."""
    import sys
    usage = ('  usage: arch [-v | --verbose] [-l "list" | --list "list"] '
             '[mailboxfiles]')
    print (usage)
    sys.exit(1)
def main():
    """Command line entry point of the `arch` script.

    Options:
      -h, -?            print usage and exit
      -v, --verbose     set VERBOSE on every HyperArchive that is used
      -l, --list NAME   only work on the list called NAME

    With a list name and mailbox file arguments, each mailbox file is fed
    into that list's archive (the list is opened with lock=0).  With a
    list name only, that list's pending archive file is processed.  With
    no list name, every list returned by Mailman.Utils.list_names() that
    has archiving switched on is processed.
    """
    if '-h' in sys.argv or '-?' in sys.argv:
        PrintUsage()
        return

    # Now we parse the command line options
    try:
        opts, params = getopt.getopt(sys.argv[1:], 'vl:', ['verbose','list='])
    except getopt.error:
        # Unknown option or missing argument: show the synopsis rather
        # than dying with a traceback.
        PrintUsage()
        return

    verbose=0
    listname=None
    for option, arg in opts:
        if option in ('-l', '--list'):
            listname=arg
        elif option in ('-v', '--verbose'):
            verbose=1

    if listname is not None:
        if params:
            # Manually add the given mailbox files to the archive.
            l=MailList(listname,lock=0)
            h=HyperArchive(l)
            if verbose:
                h.VERBOSE=1
            for each in params:
                try:
                    f=open(each,'r')
                except IOError:
                    sys.stderr.write("Cannot open %s\n" % each )
                    continue
                try:
                    h.processUnixMailbox(f,Mailman.HyperArch.Article)
                finally:
                    # Don't leak the mailbox file if archiving blows up.
                    f.close()
            h.close()
        else:
            l=MailList(listname)
            h=HyperArchive(l)
            if verbose:
                h.VERBOSE=1
            h.processListArch()
            h.close()
    else:
        #loop thru all the lists, doing archiving -ddm.
        for each in Mailman.Utils.list_names():
            if verbose:
                sys.stderr.write("Processing list %s\n" % each)
            l=MailList(each)
            if not l.archive:
                # Archiving is off for this list: just release it again.
                l.Unlock()
                del l
            else:
                h=HyperArchive(l)
                if verbose:
                    h.VERBOSE=1
                h.processListArch()
                h.close()

if __name__ == '__main__':
main()
"""HyperArch:  Pipermail archiving for MailMan, using DocumentTemplate
templates
   - The Dragon De Monsyne <dragondm@integral.org>
See <http://www.digicool.com/site/Principia/DTML.html> for some explanation
of DTML (DocumentTemplate) Syntax.
(Note: the URL listed actually documents a slightly extended DTML syntax
used by a product called Principia. Bug the folks at Digicool fer 'plain'
DTML documentation :> )
TODO:
- The templates should be files in Mailman's Template dir, instead
of static strings.
- Each list should be able to have its own templates.
Also, it should automatically fall back to default template in case
of error in list specific template.
- Should be able to force all HTML to be regenerated next time the archive
is run, in case a template is changed.
- Replace pipermail.BSDDBdatabase with something that doesn't require a C
extension (i.e. bsddb).

(Perhaps BoboPOS <http://www.digicool.com/releases/bobo/BoboPOS-rn.html>
could be used here?)
- Run a command to generate tarball of html archives for downloading
(prolly in the 'update_dirty_archives' method )
"""
import re, cgi, urllib, string
import time, pickle, os, posixfile
import DocumentTemplate
import pipermail
import mm_cfg
# DTML source for one article in the plain-text archive; compiled into
# Article.text_tmpl and rendered by Article.as_text().  It reads the
# article's email, datestr, author, subject and body (iterated line by line).
article_text_template="""From <!--#var email -->  <!--#var datestr -->
Date: <!--#var datestr -->
From: <!--#var author --> <<!--#var email -->>
Subject: <!--#var subject -->
<!--#in body --><!--#var sequence-item  --><!--#/in -->
"""
# DTML source for one article's HTML page; compiled into Article.html_tmpl
# and rendered by Article.as_html().  It reads the article's subject, email,
# author, datestr, sequence, body, and the optional prev/next articles
# (their filename and subject).  The <!--beginarticle--> / <!--endarticle-->
# marker lines delimit the body so that Article.loadbody_fromHTML() can
# read it back from the generated page.
article_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<TITLE><!--#var subject html_quote --></TITLE>
<LINK REL="Index" HREF="index.html" >
<LINK REL="made" HREF="mailto:<!--#var email url_quote -->">
<!--linkthreads-->
<!--#if prev -->
<LINK REL="Previous"  HREF="<!--#var expr="prev.filename" url_quote -->">
<!--#/if -->
<!--#if next -->
<LINK REL="Next"  HREF="<!--#var expr="next.filename" url_quote -->">
<!--#/if -->
<!--endlinkthreads-->
</HEAD>
<BODY BGCOLOR="#ffffff">
<H1><!--#var subject html_quote --></H1>
<B><!--#var author html_quote --></B>
<A HREF="mailto:<!--#var email url_quote -->" TITLE="<!--#var subject html_quote -->"><!--#var email html_quote --></A><BR>
<I><!--#var datestr html_quote --></I>
<P><UL>
<!--threads-->
<!--#if prev -->
<LI> Previous message: <A HREF="<!--#var expr="prev.filename" url_quote -->"><!--#var expr="prev.subject" html_quote--></A></li>
<!--#/if -->
<!--#if next -->
<LI> Next message: <A HREF="<!--#var expr="next.filename" url_quote -->"><!--#var expr="next.subject" html_quote --></A></li>
<!--#/if -->
<!--endthreads-->
<LI> <B>Messages sorted by:</B>
<a href="date.html#<!--#var sequence -->">[ date ]</a>
<a href="thread.html#<!--#var sequence -->">[ thread ]</a>
<a href="subject.html#<!--#var sequence -->">[ subject ]</a>
<a href="author.html#<!--#var sequence -->">[ author ]</a>
</LI>
</UL>
<HR>

<!--beginarticle-->
<!--#in body --><!--#var sequence-item  --><!--#/in -->
<!--endarticle-->
<HR>
<P><UL>
<!--threads-->
<!--#if prev -->
<LI> Previous message: <A HREF="<!--#var expr="prev.filename" url_quote -->"><!--#var expr="prev.subject" html_quote--></A></li>
<!--#/if -->
<!--#if next -->
<LI> Next message: <A HREF="<!--#var expr="next.filename" url_quote -->"><!--#var expr="next.subject" html_quote --></A></li>
<!--#/if -->
<!--endthreads-->
<LI> <B>Messages sorted by:</B>
<a href="date.html#<!--#var sequence -->">[ date ]</a>
<a href="thread.html#<!--#var sequence -->">[ thread ]</a>
<a href="subject.html#<!--#var sequence -->">[ subject ]</a>
<a href="author.html#<!--#var sequence -->">[ author ]</a>
</LI>
</UL>
</body></html>
"""
# DTML source for the top of every index page (thread/subject/author/date);
# compiled into HyperArchive.html_hdr_tmpl and rendered by
# HyperArchive.html_head().  It reads maillist, archive, type, ARCHIVES,
# firstdate, lastdate and size from the archive object, omits the link to
# the index currently being shown, and ends by opening the <ul> that the
# index entries are written into.
index_header_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<title>The <!--#var expr="maillist.real_name" --> <!--#var archive --> Archive by <!--#var type --></title>
</HEAD>
<BODY BGCOLOR="#ffffff">
<a name="start"></A>
<h1><!--#var archive --> Archives by <!--#var type --></h1>
<ul>
<li> <b>Messages sorted by:</b>
<!--#if expr="type<>'Thread' " -->
<a href="thread.html#start">[ thread ]</a>
<!--#/if -->
<!--#if expr="type<>'Subject' " -->
<a href="subject.html#start">[ subject ]</a>
<!--#/if -->
<!--#if expr="type<>'Author' " -->
<a href="author.html#start">[ author ]</a>
<!--#/if -->
<!--#if expr="type<> 'Date' " -->
<a href="date.html#start">[ date ]</a>
<!--#/if --> </li>
<li><b><a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a></b></li>
<!--#if ARCHIVES -->
<li> <b><a href="<!--#var ARCHIVES -->">Other mail archives</a></b> </li>
<!--#/if -->
</ul>
<p><b>Starting:</b> <i><!--#var firstdate --></i><br>
<b>Ending:</b> <i><!--#var lastdate --></i><br>
<b>Messages:</b> <!--#var size --><p>
<ul>
"""
# DTML source for the bottom of every index page; compiled into
# HyperArchive.html_foot_tmpl and rendered by HyperArchive.html_foot().
# It closes the entry list opened by the header template, repeats the
# "sorted by" navigation and reads lastdate, archivedate, type, maillist,
# ARCHIVES and version from the archive object.
index_footer_template="""
</ul>
<p>
<a name="end"><b>Last message date:</b></a>
<i><!--#var lastdate --></i><br>
<b>Archived on:</b> <i><!--#var archivedate --></i>
<p>
<ul>
<li> <b>Messages sorted by:</b>
<!--#if expr="type<>'Thread' " -->
<a href="thread.html#start">[ thread ]</a>
<!--#/if -->
<!--#if expr="type<>'Subject' " -->
<a href="subject.html#start">[ subject ]</a>
<!--#/if -->
<!--#if expr="type<>'Author' " -->
<a href="author.html#start">[ author ]</a>
<!--#/if -->
<!--#if expr="type<> 'Date' " -->
<a href="date.html#start">[ date ]</a>
<!--#/if --> </li>
<li><b><a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a></b></li>
<!--#if ARCHIVES -->
<li> <b><a href="<!--#var ARCHIVES -->">Other mail archives</a></b> </li>
<!--#/if -->
</ul>
<p>
<hr>
<i>This archive was generated by <a href="http://starship.skyport.net/crew/amk/maintained/pipermail.html">Pipermail <!--#var version --></a>.</i>
</BODY>
</HTML>"""
# DTML source for the archive's top-level table of contents; compiled into
# HyperArchive.html_TOC_tmpl and rendered by HyperArchive.html_TOC().
# It reads maillist and archives (the list of volume names) from the
# archive object.  The #else / #/if tags that close the outer
# "#if archives" must be real DTML comment tags: written as "&lt;!--#..."
# they are plain text and the #if block is left unterminated.
TOC_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<title>The <!--#var expr="maillist.real_name"  --> Archives</title>
</HEAD>
<BODY BGCOLOR="#ffffff">
<h1>The <!--#var expr="maillist.real_name"  --> Archives </h1>
<p>
<a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a>
</p>
<!--#if archives -->
<!--#in archives -->
<!--#if sequence-start  -->
<table border=3>
<tr><td>Archive</td> <td>View by:</td> <td>Downloadable version</td></tr>
<!--#/if -->
<tr>
<td><!--#var sequence-item  -->:</td>
<td>
<A href="<!--#var sequence-item  -->/thread.html">[ Thread ]</a>
<A href="<!--#var sequence-item  -->/subject.html">[ Subject ]</a>
<A href="<!--#var sequence-item  -->/author.html">[ Author ]</a>
<A href="<!--#var sequence-item  -->/date.html">[ Date ]</a>
</td>
<td><A href="<!--#var sequence-item  -->.txt.gz">[ Text ]</a></td>
</tr>
<!--#if sequence-end -->
</table>
<!--#/if -->
<!--#/in -->
 <!--#else -->
   <P>Currently, there are no archives. </P>
 <!--#/if -->
</BODY>
</HTML>"""
def CGIescape(arg):
    """Return str(arg) made safe for HTML text and double-quoted attributes.

    '&', '<' and '>' are replaced first (the same three substitutions, in
    the same order, that cgi.escape() performs), then '"' becomes '&quot;'
    so the result can also be placed inside HREF="..." and similar
    attribute values.
    """
    s=str(arg)
    # '&' has to go first, otherwise the entities produced below would be
    # escaped a second time.
    s=re.sub('&', '&amp;', s)
    s=re.sub('<', '&lt;', s)
    s=re.sub('>', '&gt;', s)
    s=re.sub('"', '&quot;', s)
    return s
# Module-level patterns, compiled once at import time.

# Parenthesized human name: everything from the first '(' to the last ')'
paren_name_pat=re.compile(r'([(].*[)])')
# Subject lines preceded with 'Re:' (any case, optional surrounding
# whitespace); Article.__init__ strips these repeatedly from the subject.
REpat=re.compile( r"\s*RE\s*:\s*",
re.IGNORECASE)
# E-mail addresses and URLs in text
emailpat=re.compile(r'([-+,.\w]+@[-+.\w]+)')
#  Argh!  This pattern is buggy, and will choke on URLs with GET parameters.
# (A URL here is "scheme://" followed by anything up to the next '>', ')'
# or whitespace.)
urlpat=re.compile(r'(\w+://[^>)\s]+)') # URLs in text
# Blank lines
blankpat=re.compile(r'^\s*$')
#
# Starting <html> directive: a line holding nothing but <HTML>
htmlpat=re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)

# Ending </html> directive: a line holding nothing but </HTML>
nohtmlpat=re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text: a run of '>', '|' or ':' at the start of a line.
# NOTE(review): the second alternative ('>') duplicates a member of the
# character class; it may have been '&gt;' before the text was
# entity-decoded -- confirm against the pipermail original.
quotedpat=re.compile(r'^([>|:]|>)+')
# Note: I'm overriding most, if not all of the pipermail Article class here -ddm
# The Article class encapsulates a single posting.  The attributes
# are:
#
#  sequence : Sequence number, unique for each article in a set of archives
#  subject  : Subject
#  datestr  : The posting date, in human-readable format
#  date     : The posting date, in purely numeric format
#  headers  : Any other headers of interest
#  author   : The author's name (and possibly organization)
#  email    : The author's e-mail address
#  msgid    : A unique message ID
#  in_reply_to : If !="", this is the msgid of the article being replied to
#  references: A (possibly empty) list of msgid's of earlier articles in the thread
#  body     : A list of strings making up the message body
class Article(pipermail.Article):
    """A single posting, renderable as HTML or text through DTML templates.

    Overrides most of pipermail.Article.  Besides the attributes listed
    in the comment above the class, every instance carries parentID,
    threadKey, prev and next (all None until the archiver fills them in).
    """

    # Date assigned to the most recently constructed article.  Articles
    # without a usable Date: header are filed one second after it.
    __last_article_time=time.time()

    html_tmpl=DocumentTemplate.HTML(article_template)
    text_tmpl=DocumentTemplate.HTML(article_text_template)

    def as_html(self):
        """Return this article rendered through the HTML article template."""
        return self.html_tmpl(self)

    def as_text(self):
        """Return this article rendered through the text article template."""
        return self.text_tmpl(self)

    def __init__(self, message=None, sequence=0, keepHeaders=[]):
        """Build an article from a parsed mail message.

        message     -- message object providing getheader(), has_key(),
                       item access, getdate_tz(), getaddr(), rewindbody()
                       and an `fp` file attribute.  If None, nothing is
                       initialized at all.
        sequence    -- sequence number of the article within the archive
        keepHeaders -- names of extra headers to copy into self.headers
                       (only read, never modified)
        """
        if message is None: return
        self.sequence=sequence

        self.parentID = None
        self.threadKey = None
        self.prev=None
        self.next=None

        # Assign a unique ID: the Message-Id header if there is one,
        # otherwise the current sequence number is used.
        msgid=pipermail.strip_separators(message.getheader('Message-Id'))
        if msgid=="": self.msgid=str(self.sequence)
        else: self.msgid=msgid

        if message.has_key('Subject'): self.subject=str(message['Subject'])
        else: self.subject='No subject'
        # Strip every leading 'Re:' so replies share the original subject.
        while 1:
            result=REpat.match(self.subject)
            if not result: break
            self.subject=self.subject[result.end(0):]
        if self.subject=="": self.subject='No subject'

        if message.has_key('Date'):
            self.datestr=str(message['Date'])
            date=message.getdate_tz('Date')
        else:
            self.datestr='None'
            date=None
        if date is not None:
            date, tzoffset=date[:9], date[-1]
            if not tzoffset:
                tzoffset = 0
            date=time.mktime(date)-tzoffset
        else:
            # No usable date: file it just after the previous article.
            date=Article.__last_article_time+1
        # Remember the date on the class, not on this instance, so that
        # the *next* article constructed can see it.
        Article.__last_article_time=date
        self.date='%011i' % (date,)

        # Figure out the e-mail address and poster's name
        self.author, self.email=message.getaddr('From')
        self.email=pipermail.strip_separators(self.email)
        self.author=pipermail.strip_separators(self.author)
        if self.author=="": self.author=self.email

        # Save the 'In-Reply-To:' and 'References:' lines
        i_r_t=message.getheader('In-Reply-To')
        if i_r_t is None: self.in_reply_to=''
        else:
            match=pipermail.msgid_pat.search(i_r_t)
            if match is None: self.in_reply_to=''
            else: self.in_reply_to=pipermail.strip_separators(match.group(1))

        references=message.getheader('References')
        if references is None: self.references=[]
        else: self.references=map(pipermail.strip_separators,
                                  string.split(references))

        # Save any other interesting headers
        self.headers={}
        for i in keepHeaders:
            if message.has_key(i): self.headers[i]=message[i]

        # Read the message body
        self.body=[]
        message.rewindbody()
        while (1):
            line=message.fp.readline()
            if line=="": break
            self.body.append(line)

    def loadbody_fromHTML(self,fileobj):
        """Reload self.body from a previously generated article page.

        The body is every line between the '<!--beginarticle-->' and
        '<!--endarticle-->' marker lines that article_template writes
        around it.  If the begin marker never appears the body is left
        empty.
        """
        self.body=[]
        begin=0
        while(1):
            line=fileobj.readline()
            if not line:
                break
            if (not begin) and string.strip(line)=='<!--beginarticle-->':
                begin=1
                continue
            if string.strip(line)=='<!--endarticle-->':
                break
            if begin:
                self.body.append(line)

    def __getstate__(self):
        """Return the dictionary to pickle for this article.

        maillist, prev and next are stored as None and the body as an
        empty list; the body is read back from the HTML file when needed
        (see loadbody_fromHTML).
        """
        d={}
        for each in self.__dict__.keys():
            if each in ['maillist','prev','next','body']:
                d[each] = None
            else:
                d[each] = self.__dict__[each]
        d['body']=[]
        return d
class HyperArchive(pipermail.T):
# some defaults
DIRMODE=0755 
FILEMODE=0644 

VERBOSE=0
DEFAULTINDEX='thread'
ARCHIVE_PERIOD='month'

THREADLAZY=0
THREADLEVELS=3

ALLOWHTML=1
SHOWHTML=1
IQUOTES=1
SHOWBR=1

html_hdr_tmpl=DocumentTemplate.HTML(index_header_template)
html_foot_tmpl=DocumentTemplate.HTML(index_footer_template)
html_TOC_tmpl=DocumentTemplate.HTML(TOC_template)

def html_foot(self):
    return self.html_foot_tmpl(self)

def html_head(self):
    return self.html_hdr_tmpl(self)

def html_TOC(self):
    return self.html_TOC_tmpl(self)

def __init__(self, maillist,unlock=1):
    self.maillist=maillist
    self._unlocklist=unlock
    self._lock_file=None

    if hasattr(self.maillist,'archive_volume_frequency'):
        if self.maillist.archive_volume_frequency == 0:
            self.ARCHIVE_PERIOD='year'
        elif self.maillist.archive_volume_frequency == 2:
            self.ARCHIVE_PERIOD='quarter'
        else:
            self.ARCHIVE_PERIOD='month'

pipermail.T.__init__(self, maillist.archive_directory, reload=1, database=pipermail.BSDDBdatabase(maillist.archive_directory))

def GetArchLock(self):
    if self._lock_file:
        return 1
    ou = os.umask(0)
    try:
        self._lock_file = posixfile.open(
                          os.path.join(mm_cfg.LOCK_DIR, '%s@arch.lock' % 
                          self.maillist._internal_name), 'a+')
    finally:
        os.umask(ou)
    # minor race condition here, there is no way to atomicly 
    # check & get a lock. That shouldn't matter here tho' -ddm
    if not self._lock_file.lock('w?', 1):
        self._lock_file.lock('w|', 1)
    else:
        return 0
    return 1

def DropArchLock(self):
    if self._lock_file:
        self._lock_file.lock('u')
        self._lock_file.close()
        self._lock_file = None

def processListArch(self):
    name = self.maillist.ArchiveFileName()
    wname= name+'.working'
    ename= name+'.err_unarchived'
    try:
        os.stat(name)
    except (IOError,os.error):
        #no archive file, nothin to do -ddm
        return

    #see if arch is locked here -ddm 
    if not self.GetArchLock():
        #another archiver is running, nothing to do. -ddm
        return

    #if the working file is still here, the archiver may have 
    # crashed during archiving. Save it, log an error, and move on. 
try:
        wf=open(wname,'r')
        self.maillist.LogMsg("error","Archive working file %s present. "
                             "Check %s for possibly unarchived msgs" %
                              (wname,ename) )
        ef=open(ename, 'a+')
        ef.seek(1,2)
        if ef.read(1) &lt;> '\n':
            ef.write('\n')
        ef.write(wf.read())
        ef.close()
        wf.close()
        os.unlink(wname)
    except IOError:
        pass
    os.rename(name,wname)
    if self._unlocklist:
        self.maillist.Unlock()
    archfile=open(wname,'r')
    self.processUnixMailbox(archfile, Article)
    archfile.close()
    os.unlink(wname)
    self.DropArchLock()

def get_filename(self, article):
return '%06i.html' % (article.sequence,)

def get_archives(self, article):
"""Return a list of indexes where the article should be filed.
A string can be returned if the list only contains one entry, 
and the empty list is legal."""
if article.subject in ['subscribe', 'unsubscribe']: return None
    return self.dateToVolName(string.atof(article.date))
# The following two methods should be inverses of each other. -ddm
def dateToVolName(self,date):
    datetuple=time.gmtime(date)
if self.ARCHIVE_PERIOD=='year':
    return time.strftime("%Y",datetuple)
elif self.ARCHIVE_PERIOD=='quarter':
    if adate[1] in [1,2,3]:
        return time.strftime("%Yq1",datetuple)
    elif adate[1] in [4,5,6]:
        return time.strftime("%Yq2",datetuple)
    elif adate[1] in [7,8,9]:
        return time.strftime("%Yq3",datetuple)
    else:
        return time.strftime("%Yq4",datetuple)
    # month. -ddm
else:
        return time.strftime("%Y-%B",datetuple)

def volNameToDate(self,volname):
    volname=string.strip(volname)
    volre= { 'year' : r'^(?P&lt;year>[0-9]{4,4})$',
             'quarter' : r'^(?P&lt;year>[0-9]{4,4})q(?P&lt;quarter>[1234])$',
             'month' : r'^(?P&lt;year>[0-9]{4,4})-(?P&lt;month>[a-zA-Z]+)$' }
    for each in volre.keys():
        match=re.match(volre[each],volname)
        if match:
            year=string.atoi(match.group('year'))
            month=1
            if each == 'quarter':
                q=string.atoi(match.group('quarter'))
                month=(q*3)-2
            elif each == 'month':
                monthstr=string.lower(match.group('month'))
                m=[]
                for i in range(1,13):
                    m.append(string.lower(
                             time.strftime("%B",(1999,i,1,0,0,0,0,1,0))))
                try:
                    month=m.index(monthstr)+1
                except ValueError:
                    pass
            return time.mktime((year,month,1,0,0,0,0,1,-1))
    return 0.0

def sortarchives(self):
    def sf(a,b,s=self):
        al=s.volNameToDate(a)
        bl=s.volNameToDate(b)
        if al>bl:
            return 1
        elif al&lt;bl:
            return -1
        else:
            return 0
    if self.ARCHIVE_PERIOD in ('month','year','quarter'):
        self.archives.sort(sf)
    else:
        self.archives.sort()

def message(self, msg):
if self.VERBOSE:
    import sys
    sys.stderr.write(msg)
    if msg[-1:]!='\n': sys.stderr.write('\n')

def open_new_archive(self, archive, archivedir):
import os
index_html=os.path.join(archivedir, 'index.html') 
try: os.unlink(index_html)
except: pass
os.symlink(self.DEFAULTINDEX+'.html',index_html)

def write_index_header(self):
self.depth=0
    print self.html_head()

    if not self.THREADLAZY and self.type=='Thread':
    # Update the threaded index
    self.message("Computing threaded index\n")
    self.updateThreadedIndex()

def write_index_footer(self):
import string
for i in range(self.depth): print '&lt;/UL>'
    print self.html_foot()

def write_index_entry(self, article):
print '&lt;LI> &lt;A HREF="%s">%s&lt;/A> &lt;A NAME="%i">&lt;/A>&lt;I>%s&lt;/I>' % (urllib.quote(article.filename), 
							     CGIescape(article.subject), article.sequence, 
							     CGIescape(article.author))

def write_threadindex_entry(self, article, depth):
if depth&lt;0: 
    sys.stderr.write('depth&lt;0') ; depth=0
if depth>self.THREADLEVELS: depth=self.THREADLEVELS
if depth&lt;self.depth: 
    for i in range(self.depth-depth): print '&lt;/UL>'
elif depth>self.depth: 
    for i in range(depth-self.depth): print '&lt;UL>'
print '&lt;!--%i %s -->' % (depth, article.threadKey)
self.depth=depth
print '&lt;LI> &lt;A HREF="%s">%s&lt;/A> &lt;A NAME="%i">&lt;/A>&lt;I>%s&lt;/I>' % (CGIescape(urllib.quote(article.filename)),
							     CGIescape(article.subject), article.sequence+910, 
							     CGIescape(article.author))

def write_TOC(self):
    self.sortarchives()
    toc=open(os.path.join(self.basedir, 'index.html'), 'w')
    toc.write(self.html_TOC())
    toc.close()

# Archive an Article object.
def add_article(self, article):
    # Determine into what archives the article should be placed
    archives=self.get_archives(article)
    if archives==None: archives=[]        # If no value was returned, ignore it
    if type(archives)==type(''): archives=[archives]        # If a string was returned, convert to a list
    if archives==[]: return         # Ignore the article

    # Add the article to each archive in turn
    article.filename=filename=self.get_filename(article)
    article_text=article.as_text()
    temp=self.format_article(article) # Reformat the article
    self.message("Processing article #"+str(article.sequence)+' into archives '+str(archives))
    for i in archives:
        self.archive=i
        archivedir=os.path.join(self.basedir, i)
        # If it's a new archive, create it
        if i not in self.archives: 
            self.archives.append(i) ; self.update_TOC=1
            self.database.newArchive(i)
            # If the archive directory doesn't exist, create it
            try: os.stat(archivedir)
            except os.error, errdata:
                errno, errmsg=errdata
                if errno==2: 
                    os.mkdir(archivedir, self.DIRMODE)
                else: raise os.error, errdata
            self.open_new_archive(i, archivedir)

        # Write the HTML-ized article to the html archive.
        f=open(os.path.join(archivedir, filename), 'w')
        os.chmod(os.path.join(archivedir, filename), self.FILEMODE)
        f.write(temp.as_html())
        f.close()

        # Write the text article to the text archive.
        archivetextfile=os.path.join(self.basedir,"%s.txt" % i)
        f=open(archivetextfile, 'a+')
        os.chmod(archivetextfile, self.FILEMODE)
        f.write(article_text)
        f.close()

        authorkey=pipermail.fixAuthor(article.author)+'\000'+article.date
        subjectkey=string.lower(article.subject)+'\000'+article.date

        # Update parenting info
        parentID=None
        if article.in_reply_to!='': parentID=article.in_reply_to
        elif article.references!=[]: 
            # Remove article IDs that aren't in the archive
            refs=filter(lambda x, self=self: self.database.hasArticle(self.archive, x), 
                        article.references)
            if len(refs):
                refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs)
                maxdate=refs[0]
                for ref in refs[1:]: 
                    if ref.date>maxdate.date: maxdate=ref
                parentID=maxdate.msgid
        else:
            # Get the oldest article with a matching subject, and assume this is 
            # a follow-up to that article
            parentID=self.database.getOldestArticle(self.archive, article.subject)

        if parentID!=None and not self.database.hasArticle(self.archive, parentID): 
            parentID=None
        article.parentID=parentID 
        if parentID!=None:
            parent=self.database.getArticle(self.archive, parentID)
            article.threadKey=parent.threadKey+article.date+'-'
        else: article.threadKey=article.date+'-'
        self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, article.msgid)
        self.database.addArticle(i, temp, subjectkey, authorkey)

        if i not in self._dirty_archives: 
            self._dirty_archives.append(i)
    del temp

# Update only archives that have been marked as "changed".
def update_dirty_archives(self):
    for i in self._dirty_archives:
        self.update_archive(i)
        archz=None
        archt=None
        try:
            import gzip
            try: 
                archt=open(os.path.join(self.basedir,"%s.txt" % i),"r") 
                try: 
                    os.rename(os.path.join(self.basedir,"%s.txt.gz" % i),
                          os.path.join(self.basedir,"%s.old.txt.gz" % i))
                    archz=gzip.open(os.path.join(self.basedir,"%s.old.txt.gz" % i),"r")
                except (IOError, RuntimeError, os.error):
                    pass
                newz=gzip.open(os.path.join(self.basedir,"%s.txt.gz" % i),"w") 
	    if archz :
                    newz.write(archz.read())
                    archz.close()
                    os.unlink(os.path.join(self.basedir,"%s.old.txt.gz" % i))
                newz.write(archt.read())
                newz.close()
                archt.close()
                os.unlink(os.path.join(self.basedir,"%s.txt" % i))
            except IOError:
                pass
        except ImportError:
            pass
    self._dirty_archives=[]

def close(self):
    "Close an archive, saving its state and updating any changed archives."
    self.update_dirty_archives()# Update all changed archives
    # If required, update the table of contents
    if self.update_TOC or 1:
        self.update_TOC=0
        self.write_TOC()
    # Save the collective state 
    self.message('Pickling archive state into '+os.path.join(self.basedir, 'pipermail.pck'))
    self.database.close()
    del self.database
    f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
    pickle.dump(self.__getstate__(), f)
    f.close()

def __getstate__(self):
    d={}
    for each in self.__dict__.keys():
        if not (each in ['maillist','_lock_file','_unlocklist']):
            d[each] = self.__dict__[each]
    return d

# Add &lt;A HREF="..."> tags around URLs and e-mail addresses.

def __processbody_URLquote(self, source, dest):
body2=[]
last_line_was_quoted=0
for i in xrange(0, len(source)):
    Lorig=L=source[i] ; prefix=suffix=""
    if L==None: continue
    # Italicise quoted text
    if self.IQUOTES:
	quoted=quotedpat.match(L)
	if quoted==None: last_line_was_quoted=0
	else:
	    quoted = quoted.end(0)
	    prefix=CGIescape(L[:quoted]) + '&lt;i>' 
	    suffix='&lt;/I>'
	    if self.SHOWHTML: suffix=suffix+'&lt;BR>'
	    if not last_line_was_quoted: prefix='&lt;BR>'+prefix
	    L= L[quoted:] 
	    last_line_was_quoted=1
    # Check for an e-mail address
    L2="" ; jr=emailpat.search(L) ; kr=urlpat.search(L)
    while jr!=None or kr!=None:
	if jr==None: j=-1
	else: j = jr.start(0)
	if kr==None: k=-1
	else: k = kr.start(0)
	if j!=-1 and (j&lt;k or k==-1): text=jr.group(1) ; URL='mailto:'+text ; pos=j
	elif k!=-1 and (j>k or j==-1): text=URL=kr.group(1) ; pos=k
	else: # j==k
	    raise ValueError, "j==k: This can't happen!"
	length=len(text)
#		sys.stderr.write("URL: %s %s %s \n" % (CGIescape(L[:pos]), URL, CGIescape(text)))
L2=L2+'%s<A HREF="%s">%s</A>' % (CGIescape(L[:pos]), URL, CGIescape(text))
L=L[pos+length:]
jr=emailpat.search(L) ; kr=urlpat.search(L)
if jr==None and kr==None: L=CGIescape(L)
L=prefix+L2+L+suffix
if L!=Lorig: source[i], dest[i]=None, L
# Escape all special characters
def __processbody_CGIescape(self, source, dest):
    import cgi
    for i in xrange(0, len(source)):
    if source[i]!=None: 
        dest[i]=cgi.escape(source[i]) ; source[i]=None

# Perform Hypermail-style processing of &lt;HTML>&lt;/HTML> directives
# in message bodies.  Lines between &lt;HTML> and &lt;/HTML> will be written
# out precisely as they are; other lines will be passed to func2
# for further processing .

def __processbody_HTML(self, source, dest):
    l=len(source) ; i=0
while i&lt;l:
    while i&lt;l and htmlpat.match(source[i])==None: i=i+1
    if i&lt;l: source[i]=None ; i=i+1
    while i&lt;l and nohtmlpat.match(source[i])==None:
        dest[i], source[i] = source[i], None
        i=i+1
    if i&lt;l: source[i]=None ; i=i+1

def format_article(self, article):
source=article.body ; dest=[None]*len(source)
# Handle &lt;HTML> &lt;/HTML> directives
if self.ALLOWHTML: 
    self.__processbody_HTML(source, dest)
self.__processbody_URLquote(source, dest)
if not self.SHOWHTML: 
    # Do simple formatting here: &lt;PRE>..&lt;/PRE>
    for i in range(0, len(source)):
	s=source[i]
	if s==None: continue
	dest[i]=CGIescape(s) ; source[i]=None
    if len(dest) > 0:
	dest[0]='&lt;PRE>'+dest[0] ; dest[-1]=dest[-1]+'&lt;/PRE>'
else:
    # Do fancy formatting here
    if self.SHOWBR:
	# Add &lt;BR> onto every line
	for i in range(0, len(source)):
	    s=source[i]
	    if s==None: continue
	    s=CGIescape(s) +'&lt;BR>'
	    dest[i]=s ; source[i]=None
    else:
	for i in range(0, len(source)):
	    s=source[i]
	    if s==None: continue
	    s=CGIescape(s)
	    if s[0:1] in ' \t\n': s='&lt;P>'+s
	    dest[i]=s ; source[i]=None
    article.body=filter(lambda x: x!=None, dest)
return article

def update_article(self, arcdir, article, prev, next):
import os
self.message('Updating HTML for article '+str(article.sequence))
try:
    f=open(os.path.join(arcdir, article.filename), 'r')
        article.loadbody_fromHTML(f)
    f.close()
    except IOError:
        self.message("article file %s is missing!" % os.path.join(arcdir, article.filename)) 
    article.prev=prev
    article.next=next
f=open(os.path.join(arcdir, article.filename), 'w')
f.write(article.as_html())
f.close()

Mailman web-archiver

The Dragon De Monsyne

tags

participants (1)