How to save web pages for offline reading?
Will Stuyvesant
hwlgw at hotmail.com
Tue Aug 19 16:05:16 EDT 2003
> [Anand Pillai]
> I hope this thread is not dead.
>
> I would like to know what you decided at the end :-)
> Harvestman has about 10 active subscribers right now
> and some corporates in India and umpteen of my own friends
> use it for their personal 'harvesting' needs :->
>
> I hope you downloaded at least the (new) binaries
> and gave it a go!
Just downloaded it and I will study it. Thanks!
What I use now is suboptimal, but it works well enough for simple
offline reading of blogs and it is fast (no .GIFs etc.).  It stores a
bunch of blogs I want to read in a directory (only if they are new,
checked by content-length and such).  I am sure it can be improved a
lot, comments welcome :-)
# file getsites.py
##
# DEFINITION bloglist.ini format
# This is like what effnews (www.effbot.org) uses. But I stopped
# using it because it gets too expensive to read online with a
# telephone connection.
# A file in bloglist.ini format contains a sequence of URI and Title
# pairs: URIs are on a line that starts with a '+', and then the title
# for that URI follows.
#
# for instance:
#
# +http://www.python.org
# Python Language Website
# +http://online.effbot.org
# online.effbot.org
##
import sys
import os
import urllib
TMPDIR = 'getsites'
NEWDELTA = 60
# If a new blog entry differs by less than NEWDELTA bytes from the
# old entry, then it is considered not new (suspicion of only a date
# change etc.)
# For instance, Simon Willison's weblog changed 57 bytes but without
# new content: only the "2 days 7 hours ago" message changed.
# Pyzine.com changed 400 bytes because of a new "random abstract".
# So far, Industry Toulouse is the only site that changes bytes
# without new content but does send a content-length; that is because
# it has a random quote every time you visit it.
# Some titles in a bloglist.ini file may need editing, for instance
# the title 'zone ::: effbot' will create an IOError: "No such file or
# directory 'zone ::: effbot'" (on my Windows laptop).
DEFAULTFILE = 'bloglist.ini'
wfm = "This file has a size but cannot have its date changed?"
# weird file message
##
# Change the date of a file
#
# @param filename The file that will have its date changed
# @param ddmmyyyy String with date
# @return Side-effect: file's date changed to ddmmyyyy
##
def setdate(filename, ddmmyyyy):
    import time
    d = int(ddmmyyyy[:2])
    m = int(ddmmyyyy[2:4])
    y = int(ddmmyyyy[4:])
    # 12 hours, 1 minute: gives 01:01 PM, odd
    t = time.mktime((y, m, d, 12, 1, 0, 0, 0, 0))
    os.utime(filename, (t, t))
##
# An example of a list returned:
# [{'http://myurl.com/index.html': 'My Site Name'},
# {'http://other.net/c.htm': 'spam'}]
#
# @param bloglistfile Open file in bloglist.ini format
# @return List of dict, every dict has one uri:title pair
##
def getsitedict(bloglistfile):
    lines = []
    for line in bloglistfile:
        lines.append(line)
    entry = {}    # a dict with a uri:title pair
    haveurl = ''  # the uri of the entry we are adding now
    sites = []    # a list with dicts with uri:title
    for line in lines:
        line = line.strip()
        if not line:            # skip blank lines
            continue
        if line[0] == '+':      # a new uri
            haveurl = line[1:]
            if entry:           # a finished uri:title pair
                sites.append(entry)
                entry = {}
        elif haveurl:           # collecting a title
            if entry.has_key(haveurl):
                entry[haveurl] = entry[haveurl] + line
            else:
                entry[haveurl] = line
    if entry:                   # do not lose the last entry
        sites.append(entry)
    return sites
##
# Check and maybe download sites
#
# @param sites List of dict, every dict has one uri:title pair
# @return Side-effect: new HTML in TMPDIR
##
def getsites(sites):
    for site in sites:
        uri = site.keys()[0]
        title = site[uri]
        filename = os.path.join(TMPDIR, title + '.html')
        print
        print title
        print uri
        try:
            fp = urllib.urlopen(uri)
        except IOError:
            print 'ERROR: no connection'
            continue
        oldsize = 0
        try:
            oldsize = os.path.getsize(filename)
        except OSError:
            pass                # no old file yet
        newsize = 0
        for k, v in fp.headers.items():
            if k.lower() == 'content-length':
                newsize = long(v)
                break
        if (oldsize == 0) or (newsize != oldsize):
            # There is an HTTP content-length and it is not the same
            # as the file we already have (new != old), or we do not
            # have a file yet (old == 0).
            print 'oldsize', oldsize, 'newsize', newsize
            print 'Downloading: '
            try:
                op = file(filename, "wb")
            except IOError:
                print 'Illegal filename:', filename
                continue
            n = 0
            while 1:
                s = fp.read(8192)
                if not s:
                    break
                op.write(s)
                n = n + len(s)
            fp.close()
            op.close()
            for k, v in fp.headers.items():
                print k, "=", v
            print "stored %s (%s bytes)" % (filename, n)
            if ((oldsize > 0)                   # there is an old file
                    and (newsize == 0)          # no HTTP content-length
                    and (abs(n - oldsize) <= NEWDELTA)):  # "no change"
                # Change the date of saved blogs that do not send an
                # HTTP content-length and that do not appear to have
                # changed.
                # TODO: this also back-dates NEW pages that happen to
                # be about the same length as the old ones.  It would
                # be better to compare the file contents, say >= 95%
                # the same, or compare the first 8192 bytes; see the
                # sketch after the script.
                try:
                    setdate(filename, '31012001')
                except IOError:
                    print wfm
                print 'Setting date to 31-01-2001 (untrusted "new")'
if __name__ == '__main__':
    # handle commandline options
    if len(sys.argv) < 2: filename = DEFAULTFILE
    else: filename = sys.argv[1]
    # check and maybe setup download directory
    try:
        os.chdir(TMPDIR)
    except OSError:
        os.mkdir(TMPDIR)
        os.chdir(TMPDIR)
    os.chdir('..')
    # get dict with sites
    sites = getsitedict(file(filename, 'r'))
    # check and maybe download sites
    getsites(sites)
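
About that TODO: instead of trusting byte counts, the old and the new
page contents could be compared directly.  Here is a rough sketch of
that idea using difflib from the standard library (the cutoff value
and the function name are just my guesses, not part of getsites.py):

# file similarcheck.py -- a sketch, not part of getsites.py
import difflib

SAMECUTOFF = 0.95  # assumed threshold: >= 95% similar is "no change"

def looks_unchanged(oldhtml, newhtml):
    # ratio() returns 1.0 for identical strings and 0.0 for
    # completely different ones; comparing only the first 8192
    # bytes keeps it cheap, as the TODO suggests.
    sm = difflib.SequenceMatcher(None, oldhtml[:8192], newhtml[:8192])
    return sm.ratio() >= SAMECUTOFF

getsites() could then keep the old file around, download the new page,
and call setdate() only when looks_unchanged() returns true.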