HTMLParser and non-ascii html pages

Tue Sep 20 16:11:58 EDT 2011

Yaşar Arabacı wrote:

> I am using a simple subclass of HTMLParser like this:
> 
> class LinkCollector(HTMLParser):
> 
>     def reset(self):
>         self.links = []
>         HTMLParser.reset(self)
> 
>     def handle_starttag(self,tag,attr):
>         if tag in ("a","link"):
>             key = "href"
>         elif tag in ("img","script"):
>             key = "src"
>         else:
>             return
>         self.links.extend([v for k,v in attr if k == key])
> 
> This gives the following error:
> 
> Traceback (most recent call last):
>   File "downloader.py", line 209, in <module>
>     if __name__ == "__main__": main()
>   File "downloader.py", line 201, in main
>     link_collect.feed(response)
>   File "C:\Python27\lib\HTMLParser.py", line 108, in feed
>     self.goahead(0)
>   File "C:\Python27\lib\HTMLParser.py", line 148, in goahead
>     k = self.parse_starttag(i)
>   File "C:\Python27\lib\HTMLParser.py", line 252, in parse_starttag
>     attrvalue = self.unescape(attrvalue)
>   File "C:\Python27\lib\HTMLParser.py", line 393, in unescape
>     return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities,
> s)
>   File "C:\Python27\lib\re.py", line 151, in sub
>     return _compile(pattern, flags).sub(repl, string, count)
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 13:
> ordinal not in range(128)

Trying to reproduce the error:

>>> from HTMLParser import HTMLParser
>>> class P(HTMLParser):
...     def handle_starttag(self, tag, attrs):
...             key, value = attrs[0]
...             print tag, key, "=", value
...
>>> def feed(s):
...     P().feed(s)
...
>>> feed("<a href='yadda'/>")
a href = yadda
>>> feed("<a href='&auml; yadda'/>")
a href = ä yadda
>>> feed("<a href='&auml; ä'/>")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<stdin>", line 2, in feed
  File "/usr/local/lib/python2.7/HTMLParser.py", line 108, in feed
    self.goahead(0)
  File "/usr/local/lib/python2.7/HTMLParser.py", line 148, in goahead
    k = self.parse_starttag(i)
  File "/usr/local/lib/python2.7/HTMLParser.py", line 252, in parse_starttag
    attrvalue = self.unescape(attrvalue)
  File "/usr/local/lib/python2.7/HTMLParser.py", line 390, in unescape
    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, 
s)
  File "/usr/local/lib/python2.7/re.py", line 151, in sub
    return _compile(pattern, flags).sub(repl, string, count)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 1: 
ordinal not in range(128)

It seems that the exception is triggered by an attribute value that contains 
both entities and non-ascii bytes.

>>> feed(u"<a href='&auml; ä'/>")
a href = ä ä

> Rest of the code available as attachment. Does anyone know how to solve
> this?

The documentation doesn't mention unicode, but it seems to work anyway:

>>> feed(u"<a href='&auml; ä'/>")
a href = ä ä

So one fix might be to convert the data to unicode before passing it to the 
HTMLParser.