[Tutor] Re: OO approach

Prahlad Vaidyanathan <slime@vsnl.net>
Tue, 26 Feb 2002 10:20:39 +0530


Hi,

On Mon, 25 Feb 2002 Danny Yoo spewed into the ether:
[-- snip --]
> Can you show us the mailURL base class?  Perhaps it's possible to avoid
> inheritance altogether by making the parent class general enough to handle
> all cases.

Well, I've attached the entire script here (~9K). I've just added
another sub-class for the Wired Daily News as well, but I have yet to
check whether it works (it should).

The script works all right, but something tells me there _must_ be a
better way of doing this :-)
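
One direction I can see, along the lines Danny suggests: drop the
sub-classes entirely and pass the two source-specific operations into
the base class as plain functions. An untested sketch (names are just
placeholders):

import time

class mailURL :
    def __init__(self, from_addr, to_addrs, filename_func, extract_func,
                 file='/dev/null', regex=None) :
        self.from_addr = from_addr
        self.to_addrs = to_addrs
        self.file = file
        self.regex = regex
        self._filename = filename_func   # link -> local file name
        self._extract = extract_func     # (file, regex) -> {desc : link}

    def filename(self, link) :
        return self._filename(link)

    def extractURLs(self) :
        return self._extract(self.file, self.regex)

    # log/retrieveURL/getAllURLs/mailHTML unchanged from the attached script

def lwn_filename(link) :
    return 'lwn-%s.html' % time.strftime("%Y-%m-%d", time.localtime())

def lwn_extract(file, regex) :
    desc = 'Linux Weekly News %s' % time.strftime("%Y-%m-%d", time.localtime())
    link = 'http://lwn.net/%s/bigpage.php3' % time.strftime("%Y/%m%d", time.localtime())
    return {desc : link}

# and in main() :
#     news.append(mailURL(frm, to_adds, lwn_filename, lwn_extract))

The retrieve/mail machinery stays in one place, and adding a news
source becomes a pair of small functions plus one constructor call
instead of a new class.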

pv.
-- 
Prahlad Vaidyanathan  <http://www.symonds.net/~prahladv/>

Parallel lines never meet, unless you bend one or both of them.

[ Attachment: mailurl.py ]

#! /usr/local/bin/python -S
"""
Script to download news URLs from :

    * The Register <http://www.theregister.co.uk>
        Extracted from an email received every weekday.
        The email is dumped into ~/Email/tmp_register by procmail
        URLs are extracted using a regex
        After they are retrieved, the tmp_register folder is removed

    * Linux Journal News <http://www.linuxjournal.com>
        Extracted from an email received weekly
        The email is dumped into ~/Email/tmp_ljnews by procmail
        URLs are extracted using a regex
        After they are retrieved, the tmp_ljnews folder is removed

    * Wired News <http://www.wired.com>
        Extracted from an email received daily
        The email is dumped into ~/Email/tmp_wired by procmail
        URLs are extracted using a regex
        After they are retrieved, the tmp_wired folder is removed

    * Linux Weekly News <http://lwn.net>
        Retrieved every Thursday

    * Linux Documentation Project <http://www.linuxdoc.org>
        Retrieved every Wednesday

    * Kernel Traffic <http://kt.zork.net>
        Retrieved every Tuesday

Author: Prahlad Vaidyanathan <slime@vsnl.net>
"""

# Set this variable if you are behind a proxy -- note that it has to be
# set after the 'import os' line below
# os.environ['http_proxy'] = 'http://foo.bar:3128'

# CAVEAT
# * if I don't connect to the net on the given day, that day's LWN/LDP/etc.
#   is not retrieved
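# One possible fix (untested sketch; the state-file name is made up):
# remember the time of the last successful run, and on the next run
# fetch whatever was missed since then, e.g.
#
#     state_file = os.path.join(os.environ['HOME'], '.mailurl-last-run')
#     try :
#         last_run = float(open(state_file).read())
#     except (IOError, ValueError) :
#         last_run = 0.0
#     # ... compare last_run against each source's schedule ...
#     open(state_file, 'w').write(str(time.time()))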

import os, sys, time
import re, string
import urllib, rfc822
import MailMsg # By Prabhu Ramachandran <prabhu@aero.iitm.ernet.in>

class mailURL :
    """
    Base class :

    Extracts the dictionary of links, and then mails them to all the 
    users in the to_addrs list using the description of the link in 
    the subject line.

    Arguments :
        from_addr   - "From" address to be used when mailing the pages
        to_addrs    - List of "To" addresses to be used when mailing
        file        - File containing the original news Email
        regex       - The regex to match in the file, to extract the URLs

    """
    def __init__(self, from_addr, to_addrs, file='/dev/null', regex='') :
        """
        Creates the archive directory
        """
        self.archive_dir = '/tmp'
        if not os.path.isdir(self.archive_dir) :
            os.mkdir(self.archive_dir)
        self.from_addr = from_addr
        self.to_addrs = to_addrs
        self.log_file = sys.stdout
        self.regex = regex
        self.file = file

    def log(self,msg) :
        """
        Logs messages to self.log_file
        """
        self.log_file.write(msg)
        self.log_file.flush()
   
    def retrieveURL(self,link) :
        """
        Retrieves a single URL, and returns the filename
        """
        save_file = os.path.join(self.archive_dir,self.filename(link))
        if os.path.isfile(save_file) :
            self.log('%s exists .. ignoring.\n' % save_file)
            return(0)
        else :
            try :
                filename, headers = urllib.urlretrieve(link,save_file)
                return(filename) # same as save_file, since we passed one in
            except IOError, txt :
                self.log("\nConnect error: %s\n" % txt)
                sys.exit(-1)
            except KeyboardInterrupt :
                self.log("\nUser aborted ...\n")
                sys.exit(-1)
 
    def getAllURLs(self) :
        """
        Retrieves all the URLs in the dictionary, and mails them
        """
        retrieve_count = 0
        links = self.extractURLs()
        for desc in links.keys() :
            self.log('%s\n\t%s -> ' % (desc, links[desc]))
            fname = self.retrieveURL(links[desc])
            if fname :
                self.mailHTML(desc,fname)
                self.log("mailed.\n")
                retrieve_count += 1
        return(retrieve_count)

    def mailHTML(self, subject, file) :
        """
        Mails file with the subject line specified.
        """
        m = MailMsg.MailMsg(self.from_addr,subject)
        fd = open(file)
        if self.file != '/dev/null' :
            fp = open(self.file)
            mail = rfc822.Message(fp) # rfc822.Message needs the open file
            message_id = mail['message-id']
            m.addHeader('References',message_id)
            fp.close()
        m.addHtml( fd.read() )
        m.mail(self.from_addr,self.to_addrs)
        fd.close()

    # The following methods are meant to be overridden by each sub-class

    def filename(self,link) :
        """
        Returns an appropriate filename for a link
        """
        return None

    def extractURLs (self) :
        """
        Returns a dictionary of {description : link} pairs
        """
        return {}

# ----------

class LinuxWeeklyNews (mailURL) :
    def filename (self, link) :
        return('lwn-%s.html' % \
            time.strftime("%Y-%m-%d",time.localtime()))

    def extractURLs (self) :
        desc = 'Linux Weekly News %s' % \
            time.strftime("%Y-%m-%d",time.localtime())
        link = 'http://lwn.net/%s/bigpage.php3' % \
            time.strftime("%Y/%m%d",time.localtime())
        return({desc:link})

# ----------

class KernelTraffic (mailURL) :
    def filename (self, link) :
        return('kt-%s.html' % \
            time.strftime("%Y-%m-%d",time.localtime()))

    def extractURLs (self) :
        desc = 'Kernel Traffic %s' % \
            time.strftime("%Y-%m-%d",time.localtime())
        link = 'http://kt.zork.net/kernel-traffic/latest.html'
        return({desc:link})

# ----------

class LinuxDoc (mailURL) :
    def filename (self, link) :
        return('ldp-%s.html' % \
            time.strftime("%Y-%m-%d",time.localtime()))

    def extractURLs (self) :
        desc = 'Linux Documentation Project News %s' % \
            time.strftime("%Y-%m-%d",time.localtime())
        link = 'http://www.linuxdoc.org/ldpwn/latest.html'
        return({desc:link})

# ----------

class WiredNews (mailURL) :
    def filename (self, link) :
        fname = os.path.basename(link[:-14])
        fname = string.replace(fname,',','-')
        return(fname)

    def extractURLs (self) :
        fd = open(self.file)
        links = {}
        lines = fd.readlines()
        for count in range(len(lines)) :
            line = lines[count]
            if self.regex.search(line) :
                desc = string.strip(lines[count-2])
                desc += " " + string.strip(lines[count-1])
                links[desc] = string.strip(line)
        fd.close()
        return(links)

# ----------

class Register (mailURL) :
    def filename (self, link) :
        return(os.path.basename(link))

    def extractURLs(self) :
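        # NOTE: identical to WiredNews.extractURLs above -- the regex does
        # all the source-specific work, so the base class could provide
        # this once instead of each sub-class repeating it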
        fd = open(self.file)
        links = {}
        lines = fd.readlines()
        for count in range(len(lines)) :
            line = lines[count]
            if self.regex.search(line) :
                desc = string.strip(lines[count-2])
                desc = desc + " " + string.strip(lines[count-1])
                links[desc] = string.strip(line)
        fd.close()
        return(links)
 
# ----------

class LinuxJournal (mailURL) :
    def filename (self, link) :
        return(link[-4:] + '.html')

    def extractURLs (self) :
        fd = open(self.file)
        links = {}
        lines = fd.readlines()
        for count in range(len(lines)) :
            line = lines[count]
            match = self.regex.search(line)
            if match :
                desc = string.strip(lines[count-2])
                desc = desc + " " + string.strip(lines[count-1])
                links[desc] = string.strip(match.group(1))
        fd.close()
        return(links)

# ----------

def main() :
    """Runs the script"""
    # Change these to suit your needs
    frm = "The News <newsd@marvin>"
    to_adds = ["prahlad@localhost"]
    news = []
    # Register
    filename = os.path.join(os.environ['HOME'],'Email/tmp_register') 
    if os.path.isfile(filename) :
        regex = re.compile(r'http://www\.theregister\.co\.uk/')
        news.append( Register(frm,to_adds,filename,regex) )
    # LJ news
    filename = os.path.join(os.environ['HOME'],'Email/tmp_ljnews')
    if os.path.isfile(filename) :
        regex = re.compile(r'(http://www\.linuxjournal\.com/article\.php[^\s]*)')
        news.append( LinuxJournal(frm,to_adds,filename,regex) )
    # Wired news
    filename = os.path.join(os.environ['HOME'],'Email/tmp_wired')
    if os.path.isfile(filename) :
        regex = re.compile(r'http://www\.wired\.com/news/.+')
        news.append( WiredNews(frm,to_adds,filename,regex) )

    # Weekly news
    day_of_the_week = time.localtime()[6]
    # Days => 0:Mon ... 6:Sun
    if day_of_the_week == 3 :
        news.append( LinuxWeeklyNews(frm,to_adds) )
    elif day_of_the_week == 2 :
        news.append( LinuxDoc(frm,to_adds) ) 
    elif day_of_the_week == 1 :
        news.append( KernelTraffic(frm,to_adds) )

    # The loop
    count = 0
    for new in news :
        count += new.getAllURLs()
        if new.file != '/dev/null' :
            # only remove real mail folders, never /dev/null itself
            try :
                os.remove(new.file)
            except OSError :
                pass

    print "%s: %d retrieved and mailed." % \
        (os.path.basename(sys.argv[0]),count)

if __name__ == '__main__' :
    main()
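
# The script is probably best run once a day from cron -- something like
# this crontab entry (the path is just an example) :
#   0 8 * * * $HOME/bin/mailurl.py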
