Difficulty using htmllib
pythonhda
pythonhda at yahoo.com.replacepythonwithlinux
Sat Jan 18 11:28:53 EST 2003
On 18 Jan 2003 07:16:51 -0800
joshua at goodish.org (Joshua Goodlett) wrote:
> I've written a small web robot script to familiarize myself with both
> the htmllib and urllib modules but have been getting an error message
> when my HTMLParser subclass is fed. I've included the relevant code,
> as well as the error message, below:
>
> ----- snip -----
>
> import htmllib
> from formatter import NullFormatter
>
> class URLExtractor(htmllib.HTMLParser):
> def __init__(self, outfile=None):
> htmllib.HTMLParser.__init__(self, NullFormatter())
> self.url_list = []
>
> def anchor_bgn(self, href, name, type):
> self.url_list.append(href)
>
> def anchor_end(self): pass
>
> def reset(self)
> htmllib.HTMLParser.reset(self)
> self.url_list = []
>
> def get_urls(self):
> return self.url_list
>
> if __name__ == "__main__":
>
> parser = URLExtractor()
> hrefs = []
>
> urllist = ['www.4SEINC.com',
> 'www.aw-comm.com',
> 'www.arwireless.net',
> 'www.turn-keywireless.com',
> 'www.acadianatowers.com',
> 'www.adamstower.com',
> 'www.advancedlightning.com',
> 'www.advtwoway.com',
> 'www.aerosolutionsllc.com',
> 'www.afcommsupply.com',
> 'www.alandick.com',
> 'www.allcomm.com',
> 'www.alliancecomm.com',
> 'www.alliedtower.com',
> 'www.allteccorp.com',
> 'www.alpha-telecom.com',
> 'www.alumaform.com',
> 'www.amchel.com',
> 'www.amertower.com',
> 'www.anchorguard.com',
> 'www.andrew.com',
> 'www.us.anritsu.com']
>
> for url in urllist:
> sock = urllib.urlopen('http://' + url)
> data = sock.read()
> sock.close()
> parser.feed(data)
> hrefs.extend(parser.get_urls())
> parser.close()
> parser.reset()
>
> ----- snip -----
>
> And now the error message...
>
> ----- snip -----
>
> Traceback (most recent call last):
> File "parsertest.py", line 55, in ?
> parser.feed(data)
> File "C:\Python22\lib\sgmllib.py", line 95, in feed
> self.goahead(0)
> File "C:\Python22\lib\sgmllib.py", line 161, in goahead
> k = self.parse_declaration(i)
> File "C:\Python22\lib\markupbase.py", line 66, in parse_declaration
> decltype, j = self._scan_name(j, i)
> File "C:\Python22\lib\markupbase.py", line 313, in _scan_name
> self.error("expected name token")
> File "C:\Python22\lib\sgmllib.py", line 102, in error
> raise SGMLParseError(message)
> sgmllib.SGMLParseError: expected name token
> Press any key to continue . . .
>
> ----- snip -----
>
> Any suggestions or advice as to the cause of the error would be
> appreciated. I'd suspected that the html source being fed to the
> parser is malformed, but this may be wrong.
>
> Cheers,
> Joshua Goodlett
It works up to www.turn-keywireless.com. Remove that URL from the list and it works for the rest of the sites.
More information about the Python-list
mailing list