Difficulty using htmllib
Joshua Goodlett
joshua at goodish.org
Sat Jan 18 10:16:51 EST 2003
I've written a small web robot script to familiarize myself with both
the htmllib and urllib modules but have been getting an error message
when my HTMLParser subclass is fed. I've included the relevant code,
as well as the error message, below:
----- snip -----
import htmllib
from formatter import NullFormatter
class URLExtractor(htmllib.HTMLParser):
def __init__(self, outfile=None):
htmllib.HTMLParser.__init__(self, NullFormatter())
self.url_list = []
def anchor_bgn(self, href, name, type):
self.url_list.append(href)
def anchor_end(self): pass
def reset(self)
htmllib.HTMLParser.reset(self)
self.url_list = []
def get_urls(self):
return self.url_list
if __name__ == "__main__":
parser = URLExtractor()
hrefs = []
urllist = ['www.4SEINC.com',
'www.aw-comm.com',
'www.arwireless.net',
'www.turn-keywireless.com',
'www.acadianatowers.com',
'www.adamstower.com',
'www.advancedlightning.com',
'www.advtwoway.com',
'www.aerosolutionsllc.com',
'www.afcommsupply.com',
'www.alandick.com',
'www.allcomm.com',
'www.alliancecomm.com',
'www.alliedtower.com',
'www.allteccorp.com',
'www.alpha-telecom.com',
'www.alumaform.com',
'www.amchel.com',
'www.amertower.com',
'www.anchorguard.com',
'www.andrew.com',
'www.us.anritsu.com']
for url in urllist:
sock = urllib.urlopen('http://' + url)
data = sock.read()
sock.close()
parser.feed(data)
hrefs.extend(parser.get_urls())
parser.close()
parser.reset()
----- snip -----
And now the error message...
----- snip -----
Traceback (most recent call last):
File "parsertest.py", line 55, in ?
parser.feed(data)
File "C:\Python22\lib\sgmllib.py", line 95, in feed
self.goahead(0)
File "C:\Python22\lib\sgmllib.py", line 161, in goahead
k = self.parse_declaration(i)
File "C:\Python22\lib\markupbase.py", line 66, in parse_declaration
decltype, j = self._scan_name(j, i)
File "C:\Python22\lib\markupbase.py", line 313, in _scan_name
self.error("expected name token")
File "C:\Python22\lib\sgmllib.py", line 102, in error
raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token
Press any key to continue . . .
----- snip -----
Any suggestions or advice as to the cause of the error would be
appreciated. I'd suspected that the html source being fed to the
parser is malformed, but this may be wrong.
Cheers,
Joshua Goodlett
More information about the Python-list
mailing list