HTMLParser and non-ascii html pages
Peter Otten
__peter__ at web.de
Tue Sep 20 16:11:58 EDT 2011
Yaşar Arabacı wrote:
> I am using a simple subclass of HTMLParser like this:
>
> class LinkCollector(HTMLParser):
>
> def reset(self):
> self.links = []
> HTMLParser.reset(self)
>
> def handle_starttag(self,tag,attr):
> if tag in ("a","link"):
> key = "href"
> elif tag in ("img","script"):
> key = "src"
> else:
> return
> self.links.extend([v for k,v in attr if k == key])
>
> This gives the following error:
>
> Traceback (most recent call last):
> File "downloader.py", line 209, in <module>
> if __name__ == "__main__": main()
> File "downloader.py", line 201, in main
> link_collect.feed(response)
> File "C:\Python27\lib\HTMLParser.py", line 108, in feed
> self.goahead(0)
> File "C:\Python27\lib\HTMLParser.py", line 148, in goahead
> k = self.parse_starttag(i)
> File "C:\Python27\lib\HTMLParser.py", line 252, in parse_starttag
> attrvalue = self.unescape(attrvalue)
> File "C:\Python27\lib\HTMLParser.py", line 393, in unescape
> return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities,
> s)
> File "C:\Python27\lib\re.py", line 151, in sub
> return _compile(pattern, flags).sub(repl, string, count)
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 13:
> ordinal not in range(128)
Trying to reproduce the error:
>>> from HTMLParser import HTMLParser
>>> class P(HTMLParser):
... def handle_starttag(self, tag, attrs):
... key, value = attrs[0]
... print tag, key, "=", value
...
>>> def feed(s):
... P().feed(s)
...
>>> feed("<a href='yadda'/>")
a href = yadda
>>> feed("<a href='&auml; yadda'/>")
a href = ä yadda
>>> feed("<a href='&auml; ä'/>")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "<stdin>", line 2, in feed
File "/usr/local/lib/python2.7/HTMLParser.py", line 108, in feed
self.goahead(0)
File "/usr/local/lib/python2.7/HTMLParser.py", line 148, in goahead
k = self.parse_starttag(i)
File "/usr/local/lib/python2.7/HTMLParser.py", line 252, in parse_starttag
attrvalue = self.unescape(attrvalue)
File "/usr/local/lib/python2.7/HTMLParser.py", line 390, in unescape
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities,
s)
File "/usr/local/lib/python2.7/re.py", line 151, in sub
return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1:
ordinal not in range(128)
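It seems that the exception is triggered by an attribute value that contains
both entities and non-ascii bytes: the replaced entity is presumably a unicode
string, so combining it with the remaining byte string forces an implicit
ASCII decode, which fails on the non-ascii byte.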
>>> feed(u"<a href='&auml; ä'/>")
a href = ä ä
> Rest of the code available as attachment. Does anyone know how to solve
> this?
The documentation doesn't mention unicode, but it seems to work anyway:
>>> feed(u"<a href='&auml; ä'/>")
a href = ä ä
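So one fix might be to convert the data to unicode before passing it to the
HTMLParser, i.e. decode the downloaded bytes with the page's encoding first
(for example response.decode("utf-8") if the page is UTF-8).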
More information about the Python-list
mailing list