Difficulty using htmllib

Sat Jan 18 11:28:53 EST 2003

On 18 Jan 2003 07:16:51 -0800
joshua at goodish.org (Joshua Goodlett) wrote:

> I've written a small web robot script to familiarize myself with both
> the htmllib and urllib modules but have been getting an error message
> when my HTMLParser subclass is fed.  I've included the relevant code,
> as well as the error message, below:
> 
> ----- snip -----
> 
> import htmllib
> from formatter import NullFormatter
> 
> class URLExtractor(htmllib.HTMLParser):
>     def __init__(self, outfile=None): 
>         htmllib.HTMLParser.__init__(self, NullFormatter())
>         self.url_list = []
>                 
>     def anchor_bgn(self, href, name, type):
>         self.url_list.append(href)
>     
>     def anchor_end(self): pass
> 
>     def reset(self)
>         htmllib.HTMLParser.reset(self)
>         self.url_list = []
> 
>     def get_urls(self):
>         return self.url_list
> 
> if __name__ == "__main__":
> 
>     parser = URLExtractor()
>     hrefs = []
> 
>     urllist = ['www.4SEINC.com',
>                'www.aw-comm.com',
>                'www.arwireless.net',
>                'www.turn-keywireless.com',
>                'www.acadianatowers.com',
>                'www.adamstower.com',
>                'www.advancedlightning.com',
>                'www.advtwoway.com',
>                'www.aerosolutionsllc.com',
>                'www.afcommsupply.com',
>                'www.alandick.com',
>                'www.allcomm.com',
>                'www.alliancecomm.com',
>                'www.alliedtower.com',
>                'www.allteccorp.com',
>                'www.alpha-telecom.com',
>                'www.alumaform.com',
>                'www.amchel.com',
>                'www.amertower.com',
>                'www.anchorguard.com',
>                'www.andrew.com',
>                'www.us.anritsu.com']
>                
>     for url in urllist:        
>         sock = urllib.urlopen('http://' + url)
>         data = sock.read()
>         sock.close()
>         parser.feed(data)
>         hrefs.extend(parser.get_urls())
>         parser.close()
>         parser.reset()
> 
> ----- snip -----
> 
> And now the error message...
> 
> ----- snip -----
> 
> Traceback (most recent call last):
>   File "parsertest.py", line 55, in ?
>     parser.feed(data)
>   File "C:\Python22\lib\sgmllib.py", line 95, in feed
>     self.goahead(0)
>   File "C:\Python22\lib\sgmllib.py", line 161, in goahead
>     k = self.parse_declaration(i)
>   File "C:\Python22\lib\markupbase.py", line 66, in parse_declaration
>     decltype, j = self._scan_name(j, i)
>   File "C:\Python22\lib\markupbase.py", line 313, in _scan_name
>     self.error("expected name token")
>   File "C:\Python22\lib\sgmllib.py", line 102, in error
>     raise SGMLParseError(message)
> sgmllib.SGMLParseError: expected name token
> Press any key to continue . . .
> 
> ----- snip -----
> 
> Any suggestions or advice as to the cause of the error would be
> appreciated.  I'd suspected that the html source being fed to the
> parser is malformed, but this may be wrong.
> 
> Cheers,
> Joshua Goodlett

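The script works for the URLs before www.turn-keywireless.com and fails on that one. Remove that URL from the list and it works for the rest, which suggests the HTML returned by that site is what triggers the SGMLParseError.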