Well, it's a bit rough, and there are a few more things I'd like to
do with it, but my free time may be scarce lately & I'm going on vacation in a week, so I figure I'd better post this & let some folks play with it.
Anyway, this is the integrated web-archiver module for Mailman.
For the moment, it still needs bsddb (mebbe after I get back I can try tackling replacing that)
To get this working, here is what ye need to do:
1) Make sure you have the bsddb module compiled :P
2) Make sure you have somewhere in your PYTHONPATH
the following python modules:
a) The latest pipermail (0.0.5)
b) Digicool's DocumentTemplate package
<http://www.digicool.com/releases/bobo/DocumentTemplate-rn.html>
3) Make sure you get rid of the old pipermail 0.0.2 sitting in
the Mailman package.
4) Put HyperArch.py in the Mailman package directory.
5) Put the 'arch' script in the ${prefix}/cron directory
6) Apply the included patch.
7) add an entry like this to your crontab:
# Periodically update the webarchive.
0 3,9,17,23 * * * /usr/local/bin/python /usr/services/mailman/cron/arch

Notes:
You can run the arch script as often as you like, depending on how up-to-date you want your webarchives to be.
You also can run the script from the command line to manually add
articles to the archive. (I just ran three years' worth of traffic from a reasonably high-volume majordomo list I'm moving over thru this system, and it went thru fine.)
You can set archives to use yearly, quarterly, or monthly archives
with this.
You will probably want to use the 'Set date to when re-sent' option
on your lists with this module.
All the HTML generated by this module is generated from a template.
This makes customizing the output trivial, just edit the template. (Right now the templates are static strings. This should be changed soon.)
If you do not run the arch cronjob, then this module will do nothing,
and will not interfere with an external archiver.
-The Dragon De Monsyne
diff -c /usr/src/mailman-1.0b5/Mailman/Archiver.py Mailman/Archiver.py *** /usr/src/mailman-1.0b5/Mailman/Archiver.py Mon Jul 27 17:48:31 1998 --- Mailman/Archiver.py Sun Aug 16 11:11:52 1998
*** 42,49 ****
self.archive_private = mm_cfg.DEFAULT_ARCHIVE_PRIVATE
## self.archive_update_frequency =
## mm_cfg.DEFAULT_ARCHIVE_UPDATE_FREQUENCY
! ## self.archive_volume_frequency =
! ## mm_cfg.DEFAULT_ARCHIVE_VOLUME_FREQUENCY
## self.archive_retain_text_copy =
## mm_cfg.DEFAULT_ARCHIVE_RETAIN_TEXT_COPY
--- 42,49 ----
self.archive_private = mm_cfg.DEFAULT_ARCHIVE_PRIVATE
## self.archive_update_frequency =
## mm_cfg.DEFAULT_ARCHIVE_UPDATE_FREQUENCY
! self.archive_volume_frequency =
! mm_cfg.DEFAULT_ARCHIVE_VOLUME_FREQUENCY
## self.archive_retain_text_copy =
## mm_cfg.DEFAULT_ARCHIVE_RETAIN_TEXT_COPY
*** 62,71 **** def GetBaseArchiveURL(self): if self.archive_private: return os.path.join(mm_cfg.PRIVATE_ARCHIVE_URL, ! self._internal_name + ".html") else: return os.path.join(mm_cfg.PUBLIC_ARCHIVE_URL, ! self._internal_name + ".html")
def GetConfigInfo(self):
return [
--- 62,71 ---- def GetBaseArchiveURL(self): if self.archive_private: return os.path.join(mm_cfg.PRIVATE_ARCHIVE_URL, ! self._internal_name + mm_cfg.PRIVATE_ARCHIVE_URL_EXT) else: return os.path.join(mm_cfg.PUBLIC_ARCHIVE_URL, ! self._internal_name + mm_cfg.PUBLIC_ARCHIVE_URL_EXT)
def GetConfigInfo(self):
return [
*** 85,93 **** ## "How often should new messages be incorporated? " ## "0 for no archival, 1 for daily, 2 for hourly"),
! ## ('archive_volume_frequency', mm_cfg.Radio, ('Yearly', 'Monthly'), ! ## 0, ! ## 'How often should a new archive volume be started?'),
## ('archive_retain_text_copy', mm_cfg.Toggle, ('No', 'Yes'), ## 0, --- 85,93 ---- ## "How often should new messages be incorporated? " ## "0 for no archival, 1 for daily, 2 for hourly"),
! ('archive_volume_frequency', mm_cfg.Radio, ! ('Yearly', 'Monthly','Quarterly'), 0, ! 'How often should a new archive volume be started?'),
## ('archive_retain_text_copy', mm_cfg.Toggle, ('No', 'Yes'), ## 0, diff -c /usr/src/mailman-1.0b5/Mailman/Defaults.py Mailman/Defaults.py *** /usr/src/mailman-1.0b5/Mailman/Defaults.py Wed Aug 19 01:46:37 1998 --- Mailman/Defaults.py Thu Aug 27 00:22:46 1998
*** 37,42 **** --- 37,48 ---- PUBLIC_ARCHIVE_URL = 'http://www.OVERRIDE.WITH.YOUR.PUBLIC.ARCHIVE.URL/' PRIVATE_ARCHIVE_URL = 'http://www.OVERRIDE.WITH.YOUR.PRIVATE.ARCHIVE.URL/'
+ DEFAULT_ARCHIVE_VOLUME_FREQUENCY = 1
+ PUBLIC_ARCHIVE_URL_EXT = ''
+ PRIVATE_ARCHIVE_URL_EXT = ''
- DEFAULT_ARCHIVE_PRIVATE = 0 # 0=public, 1=private HOME_PAGE = 'index.html' MAILMAN_OWNER = 'mailman-owner@%s' % DEFAULT_HOST_NAME
*** 216,219 **** VERSION = '1.0b5'
# Data file version number ! DATA_FILE_VERSION = 3 --- 222,225 ---- VERSION = '1.0b5'
# Data file version number ! DATA_FILE_VERSION = 4 diff -c /usr/src/mailman-1.0b5/Mailman/htmlformat.py Mailman/htmlformat.py *** /usr/src/mailman-1.0b5/Mailman/htmlformat.py Mon Jul 27 17:48:31 1998 --- Mailman/htmlformat.py Wed Aug 19 02:24:43 1998
*** 383,389 **** InputObj.__init__(self, name, "TEXT", value, checked=0, size=size)
class TextArea: ! def __init__(self, name, text='', rows=None, cols=None, wrap='soft'): self.name = name self.text = text self.rows = rows --- 383,389 ---- InputObj.__init__(self, name, "TEXT", value, checked=0, size=size)
class TextArea: ! def __init__(self, name, text='', rows=None, cols=None, wrap='off'): self.name = name self.text = text self.rows = rows diff -c /usr/src/mailman-1.0b5/Mailman/versions.py Mailman/versions.py *** /usr/src/mailman-1.0b5/Mailman/versions.py Mon Jul 27 17:48:31 1998 --- Mailman/versions.py Wed Aug 19 02:31:06 1998
*** 63,70 **** PreferStored('automatically_remove', 'automatic_bounce_action') # - dropped vars: for a in ['archive_retain_text_copy', ! 'archive_update_frequency', ! 'archive_volume_frequency']: if hasattr(l, a): delattr(l, a)
def UpdateOldUsers(l): --- 63,69 ---- PreferStored('automatically_remove', 'automatic_bounce_action') # - dropped vars: for a in ['archive_retain_text_copy', ! 'archive_update_frequency']: if hasattr(l, a): delattr(l, a)
def UpdateOldUsers(l):
#!/usr/local/bin/python
import sys, string, getopt, os import paths import Mailman.HyperArch import Mailman.Utils from Mailman.HyperArch import HyperArchive from Mailman.MailList import MailList
def ArchList(list):
    """Archive the pending mail of the mailing list named `list`.

    Opens the list, wraps it in a HyperArchive, processes the list's
    archive file and then closes the archive.
    """
    mlist = MailList(list)
    archive = HyperArchive(mlist)
    archive.processListArch()
    archive.close()
def PrintUsage():
    """Print the command-line synopsis and exit with status 1."""
    import sys
    synopsis = (' usage: arch [-v | --verbose] [-l "list" | --list "list"] '
                '[mailboxfiles]')
    print (synopsis)
    sys.exit(1)
def main():
    """Command-line entry point of the arch script.

    With -l/--list and mailbox file arguments, the given mailboxes are
    fed into that list's archive (the list is opened without locking).
    With -l/--list alone, that list's pending archive file is processed.
    With no list, every list returned by Mailman.Utils.list_names() that
    has archiving enabled is processed.  -v/--verbose turns on progress
    messages.  -h, -? or an unparsable command line prints the usage
    text and exits.
    """
    if '-h' in sys.argv or '-?' in sys.argv:
        PrintUsage()
        return

    # Now we parse the command line options
    try:
        opts, params = getopt.getopt(sys.argv[1:], 'vl:',
                                     ['verbose', 'list='])
    except getopt.error:
        # Unknown option or missing argument: show the synopsis instead
        # of dying with a traceback.
        PrintUsage()
        return

    verbose = 0
    listname = None
    for option, arg in opts:
        if option in ('-l', '--list'):
            listname = arg
        elif option in ('-v', '--verbose'):
            verbose = 1

    if listname is not None:
        if params:
            # Explicit mailbox files: archive them directly.
            l = MailList(listname, lock=0)
            h = HyperArchive(l)
            if verbose:
                h.VERBOSE = 1
            for each in params:
                try:
                    f = open(each, 'r')
                except IOError:
                    sys.stderr.write("Cannot open %s\n" % each)
                    continue
                # Close the mailbox even if archiving it blows up.
                try:
                    h.processUnixMailbox(f, Mailman.HyperArch.Article)
                finally:
                    f.close()
            h.close()
        else:
            l = MailList(listname)
            h = HyperArchive(l)
            if verbose:
                h.VERBOSE = 1
            h.processListArch()
            h.close()
    else:
        #loop thru all the lists, doing archiving -ddm.
        for each in Mailman.Utils.list_names():
            if verbose:
                sys.stderr.write("Processing list %s\n" % each)
            l = MailList(each)
            if not l.archive:
                # Archiving is switched off for this list.
                l.Unlock()
                del l
            else:
                h = HyperArchive(l)
                if verbose:
                    h.VERBOSE = 1
                h.processListArch()
                h.close()
if __name__ == '__main__': main()
"""HyperArch: Pipermail archiving for MailMan, using DocumentTemplate templates
- The Dragon De Monsyne <dragondm@integral.org>
See <http://www.digicool.com/site/Principia/DTML.html> for some explanation of DTML (DocumentTemplate) Syntax. (Note: the URL listed actually documents a slightly extended DTML syntax used by a product called Principia. Bug the folks at Digicool fer 'plain' DTML documentation :> )
TODO:
- The templates should be files in Mailman's Template dir, instead
of static strings.
- Each list should be able to have its own templates.
Also, it should automatically fall back to default template in case
of error in list specific template.
- Should be able to force all HTML to be regenerated next time the archive
is run, in case a template is changed.
- Replace pipermail.BSDDBdatabase with something that doesn't require a C
extension (i.e. bsddb).
(Perhaps BoboPOS <http://www.digicool.com/releases/bobo/BoboPOS-rn.html>
could be used here?)
- Run a command to generate tarball of html archives for downloading
(prolly in the 'update_dirty_archives' method )
"""
import re, cgi, urllib, string import time, pickle, os, posixfile import DocumentTemplate import pipermail import mm_cfg
article_text_template="""From <!--#var email --> <!--#var datestr --> Date: <!--#var datestr --> From: <!--#var author --> <<!--#var email -->> Subject: <!--#var subject -->
<!--#in body --><!--#var sequence-item --><!--#/in -->
"""
article_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
<HEAD>
<TITLE><!--#var subject html_quote --></TITLE>
<LINK REL="Index" HREF="index.html" >
<LINK REL="made" HREF="mailto:<!--#var email url_quote -->">
<!--linkthreads-->
<!--#if prev -->
<LINK REL="Previous" HREF="<!--#var expr="prev.filename" url_quote -->">
<!--#/if -->
<!--#if next -->
<LINK REL="Next" HREF="<!--#var expr="next.filename" url_quote -->">
<!--#/if -->
<!--endlinkthreads-->
</HEAD>
<BODY BGCOLOR="#ffffff">
<H1><!--#var subject html_quote --></H1>
<B><!--#var author html_quote --></B>
<A HREF="mailto:<!--#var email url_quote -->" TITLE="<!--#var subject html_quote -->"><!--#var email html_quote --></A><BR>
<I><!--#var datestr html_quote --></I>
<P><UL>
<!--threads-->
<!--#if prev -->
<LI> Previous message: <A HREF="<!--#var expr="prev.filename" url_quote -->"><!--#var expr="prev.subject" html_quote--></A></li>
<!--#/if -->
<!--#if next -->
<LI> Next message: <A HREF="<!--#var expr="next.filename" url_quote -->"><!--#var expr="next.subject" html_quote --></A></li>
<!--#/if -->
<!--endthreads-->
<LI> <B>Messages sorted by:</B>
<a href="date.html#<!--#var sequence -->">[ date ]</a>
<a href="thread.html#<!--#var sequence -->">[ thread ]</a>
<a href="subject.html#<!--#var sequence -->">[ subject ]</a>
<a href="author.html#<!--#var sequence -->">[ author ]</a>
</LI>
</UL>
<HR>
<!--beginarticle-->
<!--#in body --><!--#var sequence-item --><!--#/in -->
<!--endarticle--> <HR> <P><UL> <!--threads--> <!--#if prev --> <LI> Previous message: <A HREF="<!--#var expr="prev.filename" url_quote -->"><!--#var expr="prev.subject" html_quote--></A></li> <!--#/if --> <!--#if next --> <LI> Next message: <A HREF="<!--#var expr="next.filename" url_quote -->"><!--#var expr="next.subject" html_quote --></A></li> <!--#/if --> <!--endthreads--> <LI> <B>Messages sorted by:</B> <a href="date.html#<!--#var sequence -->">[ date ]</a> <a href="thread.html#<!--#var sequence -->">[ thread ]</a> <a href="subject.html#<!--#var sequence -->">[ subject ]</a> <a href="author.html#<!--#var sequence -->">[ author ]</a> </LI> </UL> </body></html> """
index_header_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"> <HTML> <HEAD> <title>The <!--#var expr="maillist.real_name" --> <!--#var archive --> Archive by <!--#var type --></title> </HEAD> <BODY BGCOLOR="#ffffff"> <a name="start"></A> <h1><!--#var archive --> Archives by <!--#var type --></h1> <ul> <li> <b>Messages sorted by:</b> <!--#if expr="type<>'Thread' " --> <a href="thread.html#start">[ thread ]</a> <!--#/if --> <!--#if expr="type<>'Subject' " --> <a href="subject.html#start">[ subject ]</a> <!--#/if --> <!--#if expr="type<>'Author' " --> <a href="author.html#start">[ author ]</a> <!--#/if --> <!--#if expr="type<> 'Date' " --> <a href="date.html#start">[ date ]</a> <!--#/if --> </li> <li><b><a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a></b></li> <!--#if ARCHIVES --> <li> <b><a href="<!--#var ARCHIVES -->">Other mail archives</a></b> </li> <!--#/if --> </ul> <p><b>Starting:</b> <i><!--#var firstdate --></i><br> <b>Ending:</b> <i><!--#var lastdate --></i><br> <b>Messages:</b> <!--#var size --><p> <ul>
"""
index_footer_template=""" </ul> <p> <a name="end"><b>Last message date:</b></a> <i><!--#var lastdate --></i><br> <b>Archived on:</b> <i><!--#var archivedate --></i> <p> <ul> <li> <b>Messages sorted by:</b> <!--#if expr="type<>'Thread' " --> <a href="thread.html#start">[ thread ]</a> <!--#/if --> <!--#if expr="type<>'Subject' " --> <a href="subject.html#start">[ subject ]</a> <!--#/if --> <!--#if expr="type<>'Author' " --> <a href="author.html#start">[ author ]</a> <!--#/if --> <!--#if expr="type<> 'Date' " --> <a href="date.html#start">[ date ]</a> <!--#/if --> </li> <li><b><a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a></b></li> <!--#if ARCHIVES --> <li> <b><a href="<!--#var ARCHIVES -->">Other mail archives</a></b> </li> <!--#/if --> </ul> <p> <hr> <i>This archive was generated by <a href="http://starship.skyport.net/crew/amk/maintained/pipermail.html">Pipermail <!--#var version --></a>.</i> </BODY> </HTML>"""
TOC_template="""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"> <HTML> <HEAD> <title>The <!--#var expr="maillist.real_name" --> Archives</title> </HEAD> <BODY BGCOLOR="#ffffff"> <h1>The <!--#var expr="maillist.real_name" --> Archives </h1> <p> <a href="<!--#var expr="maillist.GetAbsoluteScriptURL('listinfo')" -->">More info on this list...</a> </p> <!--#if archives --> <!--#in archives --> <!--#if sequence-start --> <table border=3> <tr><td>Archive</td> <td>View by:</td> <td>Downloadable version</td></tr> <!--#/if --> <tr> <td><!--#var sequence-item -->:</td> <td> <A href="<!--#var sequence-item -->/thread.html">[ Thread ]</a> <A href="<!--#var sequence-item -->/subject.html">[ Subject ]</a> <A href="<!--#var sequence-item -->/author.html">[ Author ]</a> <A href="<!--#var sequence-item -->/date.html">[ Date ]</a> </td> <td><A href="<!--#var sequence-item -->.txt.gz">[ Text ]</a></td> </tr> <!--#if sequence-end --> </table> <!--#/if --> <!--#/in -->
<!--#else -->
<P>Currently, there are no archives. </P>
<!--#/if -->
</BODY> </HTML>"""
def CGIescape(arg):
    """Return str(arg) made safe for inclusion in HTML text or attributes.

    The characters &, <, > and the double quote are replaced by their
    HTML entities.  The ampersand is handled first so that the entities
    produced by the later substitutions are not escaped a second time.
    (The quote substitution previously replaced '"' with itself, which
    left attribute values unprotected.)
    """
    s=str(arg)
    for char, entity in (('&', '&amp;'), ('<', '&lt;'),
                         ('>', '&gt;'), ('"', '&quot;')):
        s=re.sub(char, entity, s)
    return s
# Module-level regular expressions used while formatting articles.
# Each definition sits on its own line; when they share a line with the
# preceding comment the whole line is a comment and nothing is defined.

# Parenthesized human name
paren_name_pat=re.compile(r'([(].*[)])')
# Subject lines preceded with 'Re:'
REpat=re.compile( r"\s*RE\s*:\s*", re.IGNORECASE)
# E-mail addresses and URLs in text
emailpat=re.compile(r'([-+,.\w]+@[-+.\w]+)')
# Argh! This pattern is buggy, and will choke on URLs with GET parameters.
urlpat=re.compile(r'(\w+://[^>)\s]+)') # URLs in text
# Blank lines
blankpat=re.compile(r'^\s*$')

#
# Starting <html> directive
htmlpat=re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)
# Ending </html> directive
nohtmlpat=re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text
# NOTE(review): the second alternative duplicates '>' from the character
# class; it may originally have been an HTML entity -- confirm.
quotedpat=re.compile(r'^([>|:]|>)+')
# Note: I'm overriding most, if not all of the pipermail Article class here -ddm
# The Article class encapsulates a single posting.  The attributes
# are:
#
#  sequence    : Sequence number, unique for each article in a set of archives
#  subject     : Subject
#  datestr     : The posting date, in human-readable format
#  date        : The posting date, in purely numeric format
#  headers     : Any other headers of interest
#  author      : The author's name (and possibly organization)
#  email       : The author's e-mail address
#  msgid       : A unique message ID
#  in_reply_to : If !="", this is the msgid of the article being replied to
#  references  : A (possibly empty) list of msgid's of earlier articles in the thread
#  body        : A list of strings making up the message body
class Article(pipermail.Article): __last_article_time=time.time()
html_tmpl=DocumentTemplate.HTML(article_template)
text_tmpl=DocumentTemplate.HTML(article_text_template)
def as_html(self):
    """Render this article through the HTML article template."""
    render = self.html_tmpl
    return render(self)

def as_text(self):
    """Render this article through the plain-text article template."""
    render = self.text_tmpl
    return render(self)
def __init__(self, message=None, sequence=0, keepHeaders=[]):
    """Build an article from a parsed mail message.

    message     -- message object providing getheader(), has_key(),
                   getdate_tz(), getaddr(), rewindbody() and an fp
                   attribute; when None, nothing is initialised.
    sequence    -- sequence number of the article within the archive.
    keepHeaders -- names of extra headers to copy into self.headers.
    """
    import time
    if message is None: return
    self.sequence=sequence

    self.parentID = None
    self.threadKey = None
    self.prev=None
    self.next=None

    # Use the Message-Id header as the article id when it is usable;
    # otherwise the current sequence number is used.
    msgid=pipermail.strip_separators(message.getheader('Message-Id'))
    if msgid=="": self.msgid=str(self.sequence)
    else: self.msgid=msgid

    if message.has_key('Subject'): self.subject=str(message['Subject'])
    else: self.subject='No subject'
    # Strip every leading "Re:" so replies share their parent's subject.
    while 1:
        result=REpat.match(self.subject)
        if not result: break
        self.subject=self.subject[result.end(0):]
    if self.subject=="": self.subject='No subject'

    if message.has_key('Date'):
        self.datestr=str(message['Date'])
        date=message.getdate_tz('Date')
    else:
        self.datestr='None'
        date=None
    if date!=None:
        date, tzoffset=date[:9], date[-1]
        if not tzoffset:
            tzoffset = 0
        # NOTE(review): mktime() interprets the tuple as local time, so
        # the result looks shifted by the archiving host's UTC offset --
        # confirm before changing, since stored dates depend on it.
        date=time.mktime(date)-tzoffset
    else:
        # No usable date: file the article just after the previous one.
        date=Article.__last_article_time+1
    # Record on the class, not the instance: assigning through self
    # would only shadow the class attribute, so the next undated
    # article would never see this value.
    Article.__last_article_time=date
    self.date='%011i' % (date,)

    # Figure out the e-mail address and poster's name
    self.author, self.email=message.getaddr('From')
    self.email=pipermail.strip_separators(self.email)
    self.author=pipermail.strip_separators(self.author)

    if self.author=="": self.author=self.email

    # Save the 'In-Reply-To:' and 'References:' lines
    i_r_t=message.getheader('In-Reply-To')
    if i_r_t==None: self.in_reply_to=''
    else:
        match=pipermail.msgid_pat.search(i_r_t)
        if match==None: self.in_reply_to=''
        else: self.in_reply_to=pipermail.strip_separators(match.group(1))

    references=message.getheader('References')
    if references==None: self.references=[]
    else: self.references=map(pipermail.strip_separators, string.split(references))

    # Save any other interesting headers
    self.headers={}
    for i in keepHeaders:
        if message.has_key(i): self.headers[i]=message[i]

    # Read the message body
    self.body=[]
    message.rewindbody()
    while (1):
        line=message.fp.readline()
        if line=="": break
        self.body.append(line)
def loadbody_fromHTML(self,fileobj):
self.body=[]
begin=0
while(1):
line=fileobj.readline()
if not line:
break
if (not begin) and string.strip(line)=='<!--beginarticle-->':
begin=1
continue
if string.strip(line)=='<!--endarticle-->':
break
if begin:
self.body.append(line)
def __getstate__(self):
d={}
for each in self.__dict__.keys():
if each in ['maillist','prev','next','body']:
d[each] = None
else:
d[each] = self.__dict__[each]
d['body']=[]
return d
class HyperArchive(pipermail.T):
# some defaults
DIRMODE=0755
FILEMODE=0644
VERBOSE=0
DEFAULTINDEX='thread'
ARCHIVE_PERIOD='month'
THREADLAZY=0
THREADLEVELS=3
ALLOWHTML=1
SHOWHTML=1
IQUOTES=1
SHOWBR=1
html_hdr_tmpl=DocumentTemplate.HTML(index_header_template)
html_foot_tmpl=DocumentTemplate.HTML(index_footer_template)
html_TOC_tmpl=DocumentTemplate.HTML(TOC_template)
def html_foot(self):
    """Render the index footer template against this archive."""
    render = self.html_foot_tmpl
    return render(self)

def html_head(self):
    """Render the index header template against this archive."""
    render = self.html_hdr_tmpl
    return render(self)

def html_TOC(self):
    """Render the table-of-contents template against this archive."""
    render = self.html_TOC_tmpl
    return render(self)
def __init__(self, maillist,unlock=1):
    """Open the archive belonging to `maillist`.

    maillist -- the list object; its archive_directory is used as the
                archive base directory and database location.
    unlock   -- stored as _unlocklist; processListArch() unlocks the
                list early when it is true.

    If the list has an archive_volume_frequency attribute it selects
    the volume period: 0 is yearly, 2 is quarterly, anything else is
    monthly.
    """
    self.maillist=maillist
    self._unlocklist=unlock
    self._lock_file=None
    if hasattr(maillist,'archive_volume_frequency'):
        frequency=maillist.archive_volume_frequency
        if frequency == 2:
            self.ARCHIVE_PERIOD='quarter'
        elif frequency == 0:
            self.ARCHIVE_PERIOD='year'
        else:
            self.ARCHIVE_PERIOD='month'
    archive_dir=maillist.archive_directory
    pipermail.T.__init__(self, archive_dir, reload=1,
                         database=pipermail.BSDDBdatabase(archive_dir))
def GetArchLock(self):
    """Try to take this list's archiver lock without waiting for it.

    Returns 1 when this object holds the lock (either already or as of
    this call) and 0 when a conflicting lock is reported.  The lock
    file is '<listname>@arch.lock' in mm_cfg.LOCK_DIR.
    """
    if self._lock_file:
        return 1
    # Create the lock file with a cleared umask, restoring it afterwards.
    ou = os.umask(0)
    try:
        lock_file = posixfile.open(
            os.path.join(mm_cfg.LOCK_DIR, '%s@arch.lock' %
                         self.maillist._internal_name), 'a+')
    finally:
        os.umask(ou)
    # minor race condition here, there is no way to atomicly
    # check & get a lock. That shouldn't matter here tho' -ddm
    if lock_file.lock('w?', 1):
        # A conflicting lock exists.  Do not keep the handle around:
        # a remembered handle would make the next call report success
        # and DropArchLock() release a lock we never held.
        lock_file.close()
        return 0
    lock_file.lock('w|', 1)
    self._lock_file = lock_file
    return 1
def DropArchLock(self):
    """Release and forget the archiver lock, if one is held."""
    held = self._lock_file
    if not held:
        return
    held.lock('u')
    held.close()
    self._lock_file = None
def processListArch(self):
    """Archive the messages waiting in the list's archive file.

    The archive file is renamed to '<name>.working', the list is
    unlocked (when _unlocklist is set) and the working file is fed to
    processUnixMailbox() before being removed.  Returns without doing
    anything when there is no archive file or when another archiver
    holds the archive lock.

    A working file left behind by an earlier run is appended to
    '<name>.err_unarchived' and logged before the new run starts.  If
    that copy fails the error propagates, so the leftover file is never
    overwritten by the rename.  The archive lock is released however
    the run ends.
    """
    name = self.maillist.ArchiveFileName()
    wname= name+'.working'
    ename= name+'.err_unarchived'
    try:
        os.stat(name)
    except (IOError,os.error):
        #no archive file, nothin to do -ddm
        return

    #see if arch is locked here -ddm
    if not self.GetArchLock():
        #another archiver is running, nothing to do. -ddm
        return

    try:
        #if the working file is still here, the archiver may have
        # crashed during archiving. Save it, log an error, and move on.
        try:
            wf=open(wname,'r')
        except IOError:
            # The normal case: no leftover working file.
            wf=None
        if wf is not None:
            self.maillist.LogMsg("error","Archive working file %s present. "
                                 "Check %s for possibly unarchived msgs" %
                                 (wname,ename) )
            try:
                leftover=wf.read()
            finally:
                wf.close()
            ef=open(ename, 'a+')
            try:
                # Make sure the existing contents end with a newline
                # before appending the rescued messages.
                ef.seek(0,2)
                if ef.tell() > 0:
                    ef.seek(-1,2)
                    if ef.read(1) != '\n':
                        ef.seek(0,2)
                        ef.write('\n')
                ef.write(leftover)
            finally:
                ef.close()
            os.unlink(wname)

        os.rename(name,wname)
        if self._unlocklist:
            self.maillist.Unlock()
        archfile=open(wname,'r')
        try:
            self.processUnixMailbox(archfile, Article)
        finally:
            archfile.close()
        # Only reached on success; after a crash the working file stays
        # behind so the next run can rescue it.
        os.unlink(wname)
    finally:
        self.DropArchLock()
def get_filename(self, article):
    """Return the HTML file name for `article`: its sequence number,
    zero-padded to six digits, plus '.html'."""
    number = article.sequence
    return '%06i.html' % (number,)
def get_archives(self, article):
    """Return a list of indexes where the article should be filed.
    A string can be returned if the list only contains one entry,
    and the empty list is legal."""
    # Bare (un)subscribe requests sent to the list are not archived.
    if article.subject not in ['subscribe', 'unsubscribe']:
        # article.date is a numeric string; float() parses it exactly
        # as string.atof() does.
        return self.dateToVolName(float(article.date))
    return None
# The following two methods should be inverses of each other. -ddm
def dateToVolName(self,date):
    """Return the name of the archive volume that `date` falls into.

    date -- seconds since the epoch; it is broken down with
            time.gmtime(), i.e. in UTC.

    Depending on self.ARCHIVE_PERIOD the name is 'YYYY' ('year'),
    'YYYYqN' with N in 1..4 ('quarter') or 'YYYY-Monthname' (anything
    else, i.e. monthly).  volNameToDate() parses these names back.
    """
    datetuple=time.gmtime(date)
    period=self.ARCHIVE_PERIOD
    if period=='year':
        return time.strftime("%Y",datetuple)
    elif period=='quarter':
        # datetuple[1] is the month (1..12); this branch used to read
        # an undefined name and raised NameError.
        quarter=divmod(datetuple[1]-1, 3)[0]+1
        return '%sq%d' % (time.strftime("%Y",datetuple), quarter)
    # month. -ddm
    else:
        return time.strftime("%Y-%B",datetuple)
def volNameToDate(self,volname):
volname=string.strip(volname)
volre= { 'year' : r'^(?P<year>[0-9]{4,4})$',
'quarter' : r'^(?P<year>[0-9]{4,4})q(?P<quarter>[1234])$',
'month' : r'^(?P<year>[0-9]{4,4})-(?P<month>[a-zA-Z]+)$' }
for each in volre.keys():
match=re.match(volre[each],volname)
if match:
year=string.atoi(match.group('year'))
month=1
if each == 'quarter':
q=string.atoi(match.group('quarter'))
month=(q*3)-2
elif each == 'month':
monthstr=string.lower(match.group('month'))
m=[]
for i in range(1,13):
m.append(string.lower(
time.strftime("%B",(1999,i,1,0,0,0,0,1,0))))
try:
month=m.index(monthstr)+1
except ValueError:
pass
return time.mktime((year,month,1,0,0,0,0,1,-1))
return 0.0
def sortarchives(self):
def sf(a,b,s=self):
al=s.volNameToDate(a)
bl=s.volNameToDate(b)
if al>bl:
return 1
elif al<bl:
return -1
else:
return 0
if self.ARCHIVE_PERIOD in ('month','year','quarter'):
self.archives.sort(sf)
else:
self.archives.sort()
def message(self, msg):
    """Write a progress message to stderr when VERBOSE is set.

    A newline is added if `msg` does not already end with one.
    """
    if not self.VERBOSE:
        return
    import sys
    err = sys.stderr
    err.write(msg)
    if msg[-1:]!='\n':
        err.write('\n')
def open_new_archive(self, archive, archivedir):
    """Point index.html of a volume directory at the default index.

    archive    -- name of the volume (not used here).
    archivedir -- directory of the volume; its 'index.html' is
                  replaced by a symlink to '<DEFAULTINDEX>.html'.
    """
    import os
    index_html=os.path.join(archivedir, 'index.html')
    # Remove a previous link if there is one.  Only OS-level failures
    # (typically "no such file") are ignored; anything else, including
    # a keyboard interrupt, is no longer swallowed.
    try: os.unlink(index_html)
    except os.error: pass
    os.symlink(self.DEFAULTINDEX+'.html',index_html)
def write_index_header(self):
    """Print the index page header and reset the <UL> nesting depth.

    For the 'Thread' index the threaded index is recomputed first,
    unless THREADLAZY is set.
    """
    self.depth=0
    print (self.html_head())
    if self.THREADLAZY or self.type!='Thread':
        return
    # Update the threaded index
    self.message("Computing threaded index\n")
    self.updateThreadedIndex()
def write_index_footer(self):
    """Print a '</UL>' for every list level still open (self.depth of
    them), followed by the index page footer."""
    still_open = self.depth
    while still_open > 0:
        print ('</UL>')
        still_open = still_open - 1
    print (self.html_foot())
def write_index_entry(self, article):
    """Print the <LI> line for `article` in a flat index: a link to the
    article file, an anchor named after its sequence number, and the
    author."""
    fields = (urllib.quote(article.filename),
              CGIescape(article.subject),
              article.sequence,
              CGIescape(article.author))
    print ('<LI> <A HREF="%s">%s</A> <A NAME="%i"></A><I>%s</I>' % fields)
def write_threadindex_entry(self, article, depth):
    """Print the entry for `article` in the threaded index.

    depth -- nesting level of the article within its thread; it is
             clamped to the range 0..THREADLEVELS.

    <UL> or </UL> tags are printed first to move from the previous
    nesting level (self.depth) to the new one, then a comment holding
    the depth and thread key, then the <LI> line itself.
    """
    # This module has no file-level "import sys"; import it here so the
    # diagnostic below works instead of raising NameError.
    import sys
    if depth<0:
        sys.stderr.write('depth<0') ; depth=0
    if depth>self.THREADLEVELS: depth=self.THREADLEVELS
    if depth<self.depth:
        for i in range(self.depth-depth): print ('</UL>')
    elif depth>self.depth:
        for i in range(depth-self.depth): print ('<UL>')
    print ('<!--%i %s -->' % (depth, article.threadKey))
    self.depth=depth
    # The anchor is the plain sequence number, as in the flat indexes:
    # article pages link to thread.html#<sequence>, which an offset
    # anchor name would never match.
    print ('<LI> <A HREF="%s">%s</A> <A NAME="%i"></A><I>%s</I>' % (
        CGIescape(urllib.quote(article.filename)),
        CGIescape(article.subject), article.sequence,
        CGIescape(article.author)))
def write_TOC(self):
    """Sort the volumes and rewrite the archive's top-level index.html
    (in self.basedir) from the table-of-contents template."""
    self.sortarchives()
    # Render before opening the file: opening for writing truncates it,
    # so a failing template would otherwise leave an empty index.html.
    page=self.html_TOC()
    toc=open(os.path.join(self.basedir, 'index.html'), 'w')
    try:
        toc.write(page)
    finally:
        toc.close()
# Archive an Article object.
def add_article(self, article):
    """Archive an Article object.

    The article is filed into every volume named by get_archives():
    its HTML page is written into the volume directory, its text form
    is appended to '<volume>.txt', its thread parent and thread key
    are worked out, and it is added to the database.  Each touched
    volume is recorded in self._dirty_archives.  Nothing happens when
    get_archives() yields no volume.
    """
    # Determine into what archives the article should be placed
    archives=self.get_archives(article)
    if archives==None: archives=[] # If no value was returned, ignore it
    if type(archives)==type(''): archives=[archives] # If a string was returned, convert to a list
    if archives==[]: return # Ignore the article

    # Add the article to each archive in turn
    article.filename=filename=self.get_filename(article)
    # The text form must be rendered before format_article() rewrites
    # the body as HTML.
    article_text=article.as_text()
    temp=self.format_article(article) # Reformat the article
    self.message("Processing article #"+str(article.sequence)+' into archives '+str(archives))
    for i in archives:
        self.archive=i
        archivedir=os.path.join(self.basedir, i)
        # If it's a new archive, create it
        if i not in self.archives:
            self.archives.append(i) ; self.update_TOC=1
            self.database.newArchive(i)
            # If the archive directory doesn't exist, create it
            try: os.stat(archivedir)
            except os.error, errdata:
                errno, errmsg=errdata
                # Only error number 2 leads to creating the directory;
                # any other stat failure is re-raised.
                if errno==2:
                    os.mkdir(archivedir, self.DIRMODE)
                else: raise os.error, errdata
            self.open_new_archive(i, archivedir)

        # Write the HTML-ized article to the html archive.
        f=open(os.path.join(archivedir, filename), 'w')
        os.chmod(os.path.join(archivedir, filename), self.FILEMODE)
        f.write(temp.as_html())
        f.close()

        # Write the text article to the text archive.
        archivetextfile=os.path.join(self.basedir,"%s.txt" % i)
        f=open(archivetextfile, 'a+')
        os.chmod(archivetextfile, self.FILEMODE)
        f.write(article_text)
        f.close()

        # Database keys: author / subject, a NUL byte, then the date.
        authorkey=pipermail.fixAuthor(article.author)+'\000'+article.date
        subjectkey=string.lower(article.subject)+'\000'+article.date

        # Update parenting info
        parentID=None
        if article.in_reply_to!='': parentID=article.in_reply_to
        elif article.references!=[]:
            # Remove article IDs that aren't in the archive
            refs=filter(lambda x, self=self: self.database.hasArticle(self.archive, x),
                        article.references)
            if len(refs):
                # Of the referenced articles we have, take the one with
                # the greatest date as the parent.
                refs=map(lambda x, s=self: s.database.getArticle(s.archive, x), refs)
                maxdate=refs[0]
                for ref in refs[1:]:
                    if ref.date>maxdate.date: maxdate=ref
                parentID=maxdate.msgid
        else:
            # Get the oldest article with a matching subject, and assume this is
            # a follow-up to that article
            parentID=self.database.getOldestArticle(self.archive, article.subject)

        # A parent that is not in this volume is treated as no parent.
        if parentID!=None and not self.database.hasArticle(self.archive, parentID):
            parentID=None
        article.parentID=parentID
        # The thread key is the parent's key extended by this article's
        # date, or just the date for an article that starts a thread.
        if parentID!=None:
            parent=self.database.getArticle(self.archive, parentID)
            article.threadKey=parent.threadKey+article.date+'-'
        else: article.threadKey=article.date+'-'
        self.database.setThreadKey(self.archive, article.threadKey+'\000'+article.msgid, article.msgid)
        self.database.addArticle(i, temp, subjectkey, authorkey)

        if i not in self._dirty_archives:
            self._dirty_archives.append(i)
    del temp
# Update only archives that have been marked as "changed".
# Update only archives that have been marked as "changed".
def update_dirty_archives(self):
    """Regenerate every volume listed in self._dirty_archives.

    After update_archive() has run for a volume, the pending text in
    '<volume>.txt' is folded into '<volume>.txt.gz' (the old gzip
    contents first, then the new text) and the plain text file is
    removed.  The gzip step is skipped when the gzip module cannot be
    imported, and is abandoned for a volume when an IOError occurs.
    The dirty list is emptied at the end.
    """
    for i in self._dirty_archives:
        self.update_archive(i)
        archz=None
        archt=None
        try:
            import gzip
            try:
                archt=open(os.path.join(self.basedir,"%s.txt" % i),"r")
                try:
                    # Move an existing gzip aside so its contents can be
                    # copied to the front of the new one.
                    os.rename(os.path.join(self.basedir,"%s.txt.gz" % i),
                              os.path.join(self.basedir,"%s.old.txt.gz" % i))
                    archz=gzip.open(os.path.join(self.basedir,"%s.old.txt.gz" % i),"r")
                except (IOError, RuntimeError, os.error):
                    # No previous gzip (or it cannot be read): start a
                    # fresh one.
                    pass
                newz=gzip.open(os.path.join(self.basedir,"%s.txt.gz" % i),"w")
                if archz :
                    newz.write(archz.read())
                    archz.close()
                    os.unlink(os.path.join(self.basedir,"%s.old.txt.gz" % i))
                newz.write(archt.read())
                newz.close()
                archt.close()
                os.unlink(os.path.join(self.basedir,"%s.txt" % i))
            except IOError:
                pass
        except ImportError:
            pass
    self._dirty_archives=[]
def close(self):
    "Close an archive, saving its state and updating any changed archives."
    # Update all changed archives
    self.update_dirty_archives()

    # If required, update the table of contents.  The "or 1" makes the
    # test always succeed, so the TOC is rewritten on every close.
    rewrite_toc = self.update_TOC or 1
    if rewrite_toc:
        self.update_TOC=0
        self.write_TOC()

    # Save the collective state
    self.message('Pickling archive state into '+os.path.join(self.basedir, 'pipermail.pck'))
    self.database.close()
    # The database object is dropped before the state is collected, so
    # it is not part of what gets pickled.
    del self.database
    statefile=open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
    pickle.dump(self.__getstate__(), statefile)
    statefile.close()
def __getstate__(self):
d={}
for each in self.__dict__.keys():
if not (each in ['maillist','_lock_file','_unlocklist']):
d[each] = self.__dict__[each]
return d
# Add <A HREF="..."> tags around URLs and e-mail addresses.
# Add <A HREF="..."> tags around URLs and e-mail addresses.
def __processbody_URLquote(self, source, dest):
    """Turn e-mail addresses and URLs in body lines into links.

    source -- list of raw body lines; entries already handled are None.
    dest   -- list of the same length receiving the HTML lines.

    When IQUOTES is set, lines starting with quote markers are also
    wrapped in <i>..</I>.  A line that ends up different from its raw
    form is stored in dest and its source entry set to None; unchanged
    lines are left in source for the later formatting passes.
    """
    last_line_was_quoted=0
    for i in range(0, len(source)):
        Lorig=L=source[i] ; prefix=suffix=""
        if L==None: continue
        # Italicise quoted text
        if self.IQUOTES:
            quoted=quotedpat.match(L)
            if quoted==None: last_line_was_quoted=0
            else:
                quoted = quoted.end(0)
                prefix=CGIescape(L[:quoted]) + '<i>'
                suffix='</I>'
                if self.SHOWHTML: suffix=suffix+'<BR>'
                if not last_line_was_quoted: prefix='<BR>'+prefix
                L= L[quoted:]
                last_line_was_quoted=1
        # Check for an e-mail address
        L2="" ; jr=emailpat.search(L) ; kr=urlpat.search(L)
        while jr!=None or kr!=None:
            if jr==None: j=-1
            else: j = jr.start(0)
            if kr==None: k=-1
            else: k = kr.start(0)
            # Link whichever of the two matches starts first.
            if j!=-1 and (j<k or k==-1): text=jr.group(1) ; URL='mailto:'+text ; pos=j
            elif k!=-1 and (j>k or j==-1): text=URL=kr.group(1) ; pos=k
            else: # j==k
                raise ValueError("j==k: This can't happen!")
            length=len(text)
            # The URL comes straight from the message text, so it is
            # escaped like everything else before going into the HREF
            # attribute; a '<' or '"' in it must not end up as markup.
            L2=L2+'%s<A HREF="%s">%s</A>' % (CGIescape(L[:pos]), CGIescape(URL), CGIescape(text))
            L=L[pos+length:]
            jr=emailpat.search(L) ; kr=urlpat.search(L)
        # Whatever follows the last link is plain text.
        if jr==None and kr==None: L=CGIescape(L)
        L=prefix+L2+L+suffix
        if L!=Lorig: source[i], dest[i]=None, L
# Escape all special characters
def __processbody_CGIescape(self, source, dest):
import cgi
for i in xrange(0, len(source)):
if source[i]!=None:
dest[i]=cgi.escape(source[i]) ; source[i]=None
# Perform Hypermail-style processing of <HTML></HTML> directives
# in message bodies. Lines between <HTML> and </HTML> will be written
# out precisely as they are; other lines will be passed to func2
# for further processing .
def __processbody_HTML(self, source, dest):
    """Handle <HTML> ... </HTML> directive lines in a message body.

    Lines between a line matching htmlpat and the next line matching
    nohtmlpat are moved to dest unchanged.  The directive lines
    themselves are dropped (set to None in source, nothing written to
    dest).  All other lines stay in source.
    """
    inside=0
    for position in range(len(source)):
        line=source[position]
        if not inside:
            if htmlpat.match(line)!=None:
                # Opening directive: drop it and start copying.
                source[position]=None
                inside=1
        elif nohtmlpat.match(line)!=None:
            # Closing directive: drop it and stop copying.
            source[position]=None
            inside=0
        else:
            dest[position]=line
            source[position]=None
def format_article(self, article):
    """Convert article.body from raw text lines to HTML lines.

    The body is rewritten in place and the same article object is
    returned.  ALLOWHTML enables <HTML>..</HTML> pass-through,
    SHOWHTML chooses between a single <PRE> block and per-line
    formatting, and SHOWBR chooses between a <BR> after every line and
    a <P> before lines that start with whitespace.
    """
    source=article.body ; dest=[None]*len(source)
    # Handle <HTML> </HTML> directives
    if self.ALLOWHTML:
        self.__processbody_HTML(source, dest)
    self.__processbody_URLquote(source, dest)
    if not self.SHOWHTML:
        # Do simple formatting here: <PRE>..</PRE>
        for i in range(0, len(source)):
            s=source[i]
            if s==None: continue
            dest[i]=CGIescape(s) ; source[i]=None
    else:
        # Do fancy formatting here
        if self.SHOWBR:
            # Add <BR> onto every line
            for i in range(0, len(source)):
                s=source[i]
                if s==None: continue
                s=CGIescape(s) +'<BR>'
                dest[i]=s ; source[i]=None
        else:
            for i in range(0, len(source)):
                s=source[i]
                if s==None: continue
                s=CGIescape(s)
                if s[0:1] in ' \t\n': s='<P>'+s
                dest[i]=s ; source[i]=None
    body=filter(lambda x: x!=None, dest)
    # Wrap in <PRE> only after the dropped lines are gone: dest keeps
    # None for <HTML> directive lines, and a directive on the first or
    # last line used to make the concatenation fail.
    if not self.SHOWHTML and len(body) > 0:
        body[0]='<PRE>'+body[0] ; body[-1]=body[-1]+'</PRE>'
    article.body=body
    return article
def update_article(self, arcdir, article, prev, next):
    """Rewrite an article's HTML page with new prev/next neighbours.

    The body is first reloaded from the existing page in `arcdir`
    (a missing page is only reported through message()), then the
    article's prev and next are set and the page is written again.
    """
    import os
    self.message('Updating HTML for article '+str(article.sequence))
    page=os.path.join(arcdir, article.filename)
    try:
        existing=open(page, 'r')
        article.loadbody_fromHTML(existing)
        existing.close()
    except IOError:
        self.message("article file %s is missing!" % page)
    article.prev=prev
    article.next=next
    rewritten=open(page, 'w')
    rewritten.write(article.as_html())
    rewritten.close()
participants (1)
-
The Dragon De Monsyne