HTMLParser and non-ascii html pages
Yaşar Arabacı
yasar11732 at gmail.com
Tue Sep 20 12:44:46 EDT 2011
Hi,
I am using a simple subclass of HTMLParser like this:
class LinkCollector(HTMLParser):
    """HTML parser that gathers the link targets it sees in start tags.

    After ``feed()`` has been called, ``self.links`` holds, in document
    order, the ``href`` values of ``<a>`` and ``<link>`` tags and the
    ``src`` values of ``<img>`` and ``<script>`` tags.
    """

    def reset(self):
        """Clear the collected links and reset the underlying parser."""
        self.links = []
        # Call the base class explicitly (not via super()) so the code keeps
        # working on Python 2.7, which the traceback in this post shows.
        HTMLParser.reset(self)

    def handle_starttag(self, tag, attr):
        """Record the link-carrying attribute of *tag*, if it has one.

        *attr* is the sequence of ``(name, value)`` pairs the parser passes
        for the start tag; tags other than a/link/img/script are ignored.
        """
        if tag in ("a", "link"):
            key = "href"
        elif tag in ("img", "script"):
            key = "src"
        else:
            return
        # A tag may repeat the attribute; keep every matching value.
        self.links.extend([v for k, v in attr if k == key])
This gives the following error:
Traceback (most recent call last):
File "downloader.py", line 209, in <module>
if __name__ == "__main__": main()
File "downloader.py", line 201, in main
link_collect.feed(response)
File "C:\Python27\lib\HTMLParser.py", line 108, in feed
self.goahead(0)
File "C:\Python27\lib\HTMLParser.py", line 148, in goahead
k = self.parse_starttag(i)
File "C:\Python27\lib\HTMLParser.py", line 252, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "C:\Python27\lib\HTMLParser.py", line 393, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities,
s)
File "C:\Python27\lib\re.py", line 151, in sub
return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 13:
ordinal not in range(128)
The rest of the code is available as an attachment. Does anyone know how to
solve this?
--
http://yasar.serveblog.net/
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20110920/d7951db7/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: downloader.py
Type: application/octet-stream
Size: 6210 bytes
Desc: not available
URL: <http://mail.python.org/pipermail/python-list/attachments/20110920/d7951db7/attachment.obj>
More information about the Python-list
mailing list