How to save web pages for offline reading?
Will Stuyvesant
hwlgw at hotmail.com
Tue Aug 19 16:05:16 EDT 2003
> [Anand Pillai]
> I hope this thread is not dead.
>
> I would like to know what you decided at the end :-)
> Harvestman has about 10 active subscribers right now
> and some corporates in India and umpteen of my own friends
> use it for their personal 'harvesting' needs :->
>
> I hope you downloaded at least the (new) binaries
> and gave it a go!
Just downloaded it and I will study it. Thanks!
What I use now is suboptimal, but it works well enough for simple
offline reading of blogs and it is fast (no .GIFs etc.).  It stores a
bunch of blogs I want to read in a directory (only if they are new,
checked by content-length and such).  I am sure it can be improved a
lot, comments welcome :-)
# file getsites.py
##
# DEFINITION bloglist.ini format
# This is like what effnews (www.effbot.org) uses. But I stopped
# using it because it gets too expensive to read online with a
# telephone connection.
# A file in bloglist.ini format contains a sequence of URI and Title
# pairs: URIs are on a line that starts with a '+', and then the title
# for that URI follows.
#
# for instance:
#
# +http://www.python.org
# Python Language Website
# +http://online.effbot.org
# online.effbot.org
##
import sys
import os
import urllib
TMPDIR = 'getsites'
NEWDELTA = 60
# If a new blog entry differs by less than NEWDELTA bytes from the
# old entry, then it is considered not new (suspicion of only a date
# change etc.)
# For instance, Simon Willison's weblog changed 57 bytes but without
# new content: only the "2 days 7 hours ago" message changed.
# Pyzine.com changed 400 bytes because of a new "random abstract".
# So far, Industry Toulouse is the only site that changes bytes
# without new content but does send a content-length; that is because
# it has a random quote every time you visit it.
# Some titles in a bloglist.ini file may need editing, for instance
# the title 'zone ::: effbot' will create an IOError: "No such file or
# directory 'zone ::: effbot'" (on my Windows laptop).
DEFAULTFILE = 'bloglist.ini'
wfm = "This file has a size but cannot have its date changed?"
# weird file message
##
# Change the date of a file
#
# @param filename The file that will have its date changed
# @param ddmmyyyy String with date
# @return Side-effect: file's date changed to ddmmyyyy
##
def setdate(filename, ddmmyyyy):
    import time
    d = int(ddmmyyyy[:2])
    m = int(ddmmyyyy[2:4])
    y = int(ddmmyyyy[4:])
    # 12 hours, 1 minute: gives 01:01 PM, odd
    t = time.mktime((y, m, d, 12, 1, 0, 0, 0, 0))
    os.utime(filename, (t, t))
##
# An example of a list returned:
# [{'http://myurl.com/index.html': 'My Site Name'},
# {'http://other.net/c.htm': 'spam'}]
#
# @param bloglistfile Open file in bloglist.ini format
# @return List of dict, every dict has one uri:title pair
##
def getsitedict(bloglistfile):
    lines = []
    for line in bloglistfile:
        lines.append(line)
    entry = {}    # a dict with a uri:title pair
    haveurl = ''  # the uri of the entry we are adding now
    sites = []    # a list with dicts with uri:title
    for line in lines:
        line = line.strip()
        if not line:            # skip blank lines
            continue
        if line[0] == '+':      # a new uri
            haveurl = line[1:]
            if entry:           # a finished uri:title pair
                sites.append(entry)
                entry = {}
        elif haveurl:           # collecting a title
            if entry.has_key(haveurl):
                entry[haveurl] = entry[haveurl] + line
            else:
                entry[haveurl] = line
    if entry:                   # do not lose the last entry
        sites.append(entry)
    return sites
##
# Check and maybe download sites
#
# @param sites List of dict, every dict has one uri:title pair
# @return Side-effect: new HTML in TMPDIR
##
def getsites(sites):
    for site in sites:
        uri = site.keys()[0]
        title = site[uri]
        filename = os.path.join(TMPDIR, title + '.html')
        print
        print title
        print uri
        try:
            fp = urllib.urlopen(uri)
        except IOError:
            print 'ERROR: no connection'
            continue
        oldsize = 0
        try:
            oldsize = os.path.getsize(filename)
        except OSError:
            pass                # no old file yet
        newsize = 0
        for k, v in fp.headers.items():
            if k.lower() == 'content-length':
                newsize = long(v)
                break
        if (oldsize == 0) or (newsize != oldsize):
            # There is an HTTP content-length and it is not the same
            # as the file we already have (new != old), or we do not
            # have a file yet (old == 0).
            print 'oldsize', oldsize, 'newsize', newsize
            print 'Downloading: '
            try:
                op = file(filename, "wb")
            except IOError:
                print 'Illegal filename:', filename
                continue
            n = 0
            while 1:
                s = fp.read(8192)
                if not s:
                    break
                op.write(s)
                n = n + len(s)
            fp.close()
            op.close()
            for k, v in fp.headers.items():
                print k, "=", v
            print "stored %s (%s bytes)" % (filename, n)
            if ((oldsize > 0)                   # there is an old file
                    and (newsize == 0)          # no HTTP content-length
                    and (abs(n - oldsize) <= NEWDELTA)):  # "no change"
                # Change the date of saved blogs that do not send an
                # HTTP content-length and that do not appear to have
                # changed.
                # TODO: this also back-dates NEW pages that happen to
                # be about the same length as the old ones.  It would
                # be better to compare the file contents, say >= 95%
                # the same, or compare the first 8192 bytes; see the
                # sketch after the script.
                try:
                    setdate(filename, '31012001')
                except IOError:
                    print wfm
                print 'Setting date to 31-01-2001 (untrusted "new")'
if __name__ == '__main__':
    # handle commandline options
    if len(sys.argv) < 2: filename = DEFAULTFILE
    else: filename = sys.argv[1]
    # check and maybe setup download directory
    try:
        os.chdir(TMPDIR)
    except OSError:
        os.mkdir(TMPDIR)
        os.chdir(TMPDIR)
    os.chdir('..')
    # get dict with sites
    sites = getsitedict(file(filename, 'r'))
    # check and maybe download sites
    getsites(sites)
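
About that TODO: instead of trusting byte counts, the old and the new
page contents could be compared directly.  Here is a rough sketch of
that idea using difflib from the standard library (the cutoff value
and the function name are just my guesses, not part of getsites.py):

# file similarcheck.py -- a sketch, not part of getsites.py
import difflib

SAMECUTOFF = 0.95  # assumed threshold: >= 95% similar is "no change"

def looks_unchanged(oldhtml, newhtml):
    # ratio() returns 1.0 for identical strings and 0.0 for
    # completely different ones; comparing only the first 8192
    # bytes keeps it cheap, as the TODO suggests.
    sm = difflib.SequenceMatcher(None, oldhtml[:8192], newhtml[:8192])
    return sm.ratio() >= SAMECUTOFF

getsites() could then keep the old file around, download the new page,
and call setdate() only when looks_unchanged() returns true.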