Difficulty using htmllib

Joshua Goodlett joshua at goodish.org
Sat Jan 18 16:16:51 CET 2003


I've written a small web robot script to familiarize myself with both
the htmllib and urllib modules, but I have been getting an error message
when I feed data to my HTMLParser subclass.  I've included the relevant
code, as well as the error message, below:

----- snip -----

import htmllib
from formatter import NullFormatter

class URLExtractor(htmllib.HTMLParser):
    def __init__(self, outfile=None): 
        htmllib.HTMLParser.__init__(self, NullFormatter())
        self.url_list = []
                
    def anchor_bgn(self, href, name, type):
        self.url_list.append(href)
    
    def anchor_end(self): pass

    def reset(self)
        htmllib.HTMLParser.reset(self)
        self.url_list = []

    def get_urls(self):
        return self.url_list

if __name__ == "__main__":
    # Fetch each site's front page and gather every anchor href found on it.
    # urllib is imported here because the script block is its only user.
    import urllib

    parser = URLExtractor()
    hrefs = []

    urllist = ['www.4SEINC.com',
               'www.aw-comm.com',
               'www.arwireless.net',
               'www.turn-keywireless.com',
               'www.acadianatowers.com',
               'www.adamstower.com',
               'www.advancedlightning.com',
               'www.advtwoway.com',
               'www.aerosolutionsllc.com',
               'www.afcommsupply.com',
               'www.alandick.com',
               'www.allcomm.com',
               'www.alliancecomm.com',
               'www.alliedtower.com',
               'www.allteccorp.com',
               'www.alpha-telecom.com',
               'www.alumaform.com',
               'www.amchel.com',
               'www.amertower.com',
               'www.anchorguard.com',
               'www.andrew.com',
               'www.us.anritsu.com']

    for url in urllist:
        sock = urllib.urlopen('http://' + url)
        try:
            data = sock.read()
        finally:
            # Release the connection even if the read fails part-way.
            sock.close()
        parser.feed(data)
        # Finish parsing before collecting, so that anything still buffered
        # by the parser is processed first.
        parser.close()
        hrefs.extend(parser.get_urls())
        # Clear parser state (and its URL list) before the next page.
        parser.reset()

----- snip -----

And now the error message...

----- snip -----

Traceback (most recent call last):
  File "parsertest.py", line 55, in ?
    parser.feed(data)
  File "C:\Python22\lib\sgmllib.py", line 95, in feed
    self.goahead(0)
  File "C:\Python22\lib\sgmllib.py", line 161, in goahead
    k = self.parse_declaration(i)
  File "C:\Python22\lib\markupbase.py", line 66, in parse_declaration
    decltype, j = self._scan_name(j, i)
  File "C:\Python22\lib\markupbase.py", line 313, in _scan_name
    self.error("expected name token")
  File "C:\Python22\lib\sgmllib.py", line 102, in error
    raise SGMLParseError(message)
sgmllib.SGMLParseError: expected name token
Press any key to continue . . .

----- snip -----

Any suggestions or advice as to the cause of the error would be
appreciated.  I suspect that the HTML source being fed to the
parser is malformed, but I may be wrong.

Cheers,
Joshua Goodlett




More information about the Python-list mailing list