mangled attempt at using htmllib

Wed Oct 11 14:30:28 EDT 2000

Well,

I think I'm very lost again. The idea was to parse a file which
contains lots of lines in the following format:

200 OK         <a href="urlstatusgo.html?col=test&url=http%
3A//www.foobar.com/archive/091400.html">http://www.foobar.com/archive/09
1400.html</a>

What I thought I was doing was looking for lines that began with
the "200 OK        '
string, then using htmllib to return the text between the anchor tags.
If I run the program, below, I get the following error:

D:\Program Files\Python\work>python filtertest4.py

Traceback (innermost last):
  File "filtertest4.py", line 58, in ?
    print '%s' % process(line)
  File "filtertest4.py", line 36, in process
    text2print = parser.getLink
AttributeError: getLink

So, two questions. First, how do I do this sanely. Second, when I get
an AttributeError, is there a way to find out what Attributes were
being expected, so I can adjust whatever I'm doing accordingly?

---------------------------------------------------------------
import re, htmllib,formatter,string
import exceptions,string,sys,traceback,time
import DateTime
import ODBC,ODBC.Windows,ODBC.Misc.proc
import urllib

class seekUrl(htmllib.HTMLParser):

    def __init__(self):
        self.start=0
        self.current_data=''
        htmllib.HTMLParser.__init__(
            self, formatter.NullFormatter())

	def start_a(self,attributes):
		self.current_data='boo!'
	def end_a(self):
		pass
	def handle_data(self, data):
		self.current_data=data
	def getLink(self):
		return self.current_data

def process(stuff):
    parser=seekUrl()
    parser.feed(stuff)
    text2print = parser.getLink

def showparse(filename):
    pprint.pprint(process(stuff))

currentFilePrefix = 'http://'
fileDict = {'www.cio.com':'webbusiness.cio.com'}
testOutFile = 'd:\\program files\\python\\work\\ciodump.html'
parser=seekUrl()

for root_url,currentFile in fileDict.items():
	currentFile = currentFilePrefix+currentFile+'/'
	current = urllib.urlopen(currentFile)
	inFile = current.readlines()

	for line in inFile:
		if re.search('200 OK         ',line):
			print '%s' % process(line)
#			foo = parser.myUrl()
#			print '%s' % foo
current.close()
# don't forget to clean up after yourself.
urllib.urlcleanup()
------------------------------------------

Many, many thanks,
ari

--
Ari Davidow
ari at ivritype.com

Sent via Deja.com http://www.deja.com/
Before you buy.