[Tutor] Here is a completed script for everyone's perusal.
montana
sarmxiii@knology.net
Tue, 10 Sep 2002 13:32:50 -0500
Hi Everyone-
I've attached a script to this email for everyone to check out. Please
feel free to make comments or suggestions. This script is a basic news
retriever for the FoxNews site. It downloads the articles as easily
readable HTML that I then transfer over to my Zaurus for reading. The
script is as follows:
#!/usr/bin/env python
# A simple Python script that downloads new stories from FoxNews
# Sean Armstrong, 06 September 2002
import re
import urllib, urllister
import sys
import os
import string
import time
class Html:
    """Scrape a news front page for story links and download the stories.

    An instance is bound to a site ``address`` (host name without the
    ``http://`` scheme).  It can list the story links found on that page,
    record the list in a ``news<timestamp>.fox`` snapshot file so that an
    unchanged front page is detected on the next run, and download every
    linked story.

    ``news_dir`` is the directory that holds the snapshot file and the
    downloaded pages.  It defaults to the location the script has always
    used, so existing callers need not pass it.
    """

    def __init__(self, address, news_dir="/Users/montana/News/Fox/"):
        self.address = address
        self.news_dir = news_dir

    # connection and parsing
    def connect(self):
        """Fetch the front page and return the URLs collected by URLLister."""
        sock = urllib.urlopen("http://" + self.address)
        try:
            page = sock.read()
        finally:
            # Release the connection even when the read fails.
            sock.close()
        parser = urllister.URLLister()
        parser.feed(page)
        parser.close()
        return parser.urls

    # search main doc for story links
    def linkSearch(self):
        """Return the story links on the front page, separated by spaces."""
        link = r'''\/story\/0,\d+,6\d+,00\.html'''
        source = self.connect()
        return " ".join(re.findall(link, " ".join(source)))

    # compare old source file with new
    def compare(self):
        """Update the ``.fox`` snapshot, or exit when nothing has changed.

        The current link list is compared with the contents of the existing
        ``.fox`` file in ``news_dir`` (when several exist, the last one
        listed is used).  If they differ, or there is no snapshot yet, a
        new ``news<timestamp>.fox`` file is written and the old one is
        removed.  If they are equal the process terminates via
        ``sys.exit()``, so nothing is downloaded again.
        """
        stamp = time.strftime("%j_%H%M%S_%Y", time.localtime())
        # Fetch once: the value that is compared is the value that is saved.
        current = self.linkSearch()

        oldnews = None
        for name in os.listdir(self.news_dir):
            if name[-3:] == "fox":
                oldnews = name

        previous = None  # stays None on the very first run
        if oldnews is not None:
            newsin = open(os.path.join(self.news_dir, oldnews), "r")
            try:
                previous = newsin.read()
            finally:
                newsin.close()

        # Compare against the file's *contents*, not the file object.
        if current == previous:
            print("Nothing to update. Bye.")
            sys.exit()

        print("News is being updated ...")
        newname = "news" + stamp + ".fox"
        newsout = open(os.path.join(self.news_dir, newname), "w")
        try:
            newsout.write(current)
        finally:
            newsout.close()
        # Two runs within the same second produce the same file name; do
        # not delete the snapshot that was just written.
        if oldnews is not None and oldnews != newname:
            os.remove(os.path.join(self.news_dir, oldnews))

    # download desired html links
    def download(self):
        """Download every story linked from the front page into news_dir.

        Each page is saved under the link with its leading ``/story/``
        removed.  A story that is linked more than once is fetched once.
        """
        links = []
        for link in self.linkSearch().split():
            if link not in links:
                links.append(link)

        for link in links:
            # link[7:] drops the 7-character "/story/" prefix.
            target = os.path.join(self.news_dir, link[7:])
            sock = urllib.urlopen("http://www.foxnews.com" + link)
            try:
                page = sock.read()
            finally:
                sock.close()
            # Opening for writing creates the file; no shell "touch" needed.
            output = open(target, "wb")
            try:
                output.write(page)
            finally:
                output.close()
class StoryExtractor:
    """Extract the headline and the story text from one saved story page.

    ``html`` is the path of the page on disk.  The page is expected to mark
    its headline with ``<!--Headline-->`` / ``<!--/Headline-->`` comments
    and its text with ``<!--Storytext-->`` / ``<!--/Storytext-->``.
    """

    def __init__(self, html):
        self.html = html

    def _read(self):
        """Return the whole page as a string, always closing the file."""
        page = open(self.html, "r")
        try:
            return page.read()
        finally:
            page.close()

    # search and crop headers from html
    def cropHeader(self):
        """Return the headline section, comment markers included.

        The match must sit on a single line because ``.`` does not cross
        newlines here.  Several matches are joined with single spaces; an
        empty string is returned when there is none.
        """
        headline = "%s.+%s" % ("<!--Headline-->", "<!--/Headline-->")
        return " ".join(re.findall(headline, self._read()))

    # search and crop text from html
    def crop(self):
        """Return the story text section with embedded tables blanked out.

        The section may span lines and is returned with its comment
        markers.  An empty string is returned when the page has no story
        text section.
        """
        story = "(?sx)%s.+%s" % ("<!--Storytext-->", "<!--/Storytext-->")
        newhtml = " ".join(re.findall(story, self._read()))
        # Greedy match: everything from the first "<TABLE" up to the last
        # "</TABLE>" is replaced by one space.
        middle = "(?sx)%s.+%s" % ("<TABLE", "</TABLE>")
        return re.sub(middle, " ", newhtml)
if __name__ == "__main__":
    # Driver: refresh the link snapshot, download the stories, rewrite each
    # one as a small stand-alone page in a new time-stamped folder, build an
    # index page linking to them, then delete the raw downloads.
    news_dir = "/Users/montana/News/Fox/"

    start = Html("www.foxnews.com")
    start.compare()
    start.download()

    # List the directory before the output folder is created, so the
    # listing holds the downloaded pages but not the folder itself.
    downloaded = os.listdir(news_dir)
    t = time.strftime("%j_%H%M%S_%Y", time.localtime())
    savedir = news_dir + t + "/"
    os.mkdir(savedir)

    headera = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01
Transitional//EN"
"http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="content-type" content="text/html;
charset=iso-8859-1">
<title>'''
    headerb = '''</title>
<meta name="generator" content="BBEdit 6.5.2">
</head>
<body>'''
    footer = '''</body>
</html>'''

    title_pattern = "%s(.+)%s" % ("<!--Headline-->", "<!--/Headline-->")
    stories = []    # (file name, link text) pairs, in the order written
    raw_pages = []  # downloaded pages to delete once everything is written
    count = 0
    for item in downloaded:
        if item[-4:] != "html":
            continue
        # Count stories only, so the output files are numbered without gaps.
        count += 1
        raw_pages.append(item)
        story = StoryExtractor(news_dir + item)
        headline = story.cropHeader()
        body = story.crop()
        newstory = (headera + "Story" + headerb + "<h3><u>" + headline +
                    "</u></h3>" + body + footer)
        name = "story" + str(count) + ".html"
        outfile = open(savedir + name, "w")
        try:
            outfile.write(newstory)
        finally:
            outfile.close()
        # The headline markers are kept in the page, so the link text can
        # be taken from it; fall back to the file name when a story has no
        # headline instead of failing on an empty match list.
        titles = re.findall(title_pattern, newstory)
        if titles:
            stories.append((name, titles[0]))
        else:
            stories.append((name, name))

    f = open(savedir + "index" + t + ".html", "a")
    try:
        f.write(headera + "FoxNews" + headerb)
        for name, title in stories:
            f.write('''<a href="''' + name + '''">''' + title + "</a><br><br>")
        f.write(footer)
    finally:
        # Close explicitly so the index is flushed to disk.
        f.close()

    # Remove exactly the pages that were processed, without going through
    # the shell or depending on the current working directory.
    for item in raw_pages:
        os.remove(news_dir + item)
Let me know of any bugs you come across please.
Thanks.
SA
"I can do everything on my Mac I used to do on my PC, plus alot more
..."
--Me