Webpy and UnicodeDecodeError

Fri Dec 18 10:43:21 EST 2009

Oscar Del Ben wrote:
> So I'm trying to send a file through webpy and urllib2 but I can't get
> around these UnicodeErrors. Here's the code:
>
> # controller
>
> x = web.input(video_original={})
> params = {'foo': x['foo']}
>
> files = (('video[original]', 'test', x['video_original'].file.read
> ()),)
> client.upload(upload_url, params, files, access_token())
>
> # client library
>
> def __encodeMultipart(self, fields, files):
>         """
>         fields is a sequence of (name, value) elements for regular
> form fields.
>         files is a sequence of (name, filename, value) elements for
> data to be uploaded as files
>         Return (content_type, body) ready for httplib.HTTP instance
>         """
>         boundary = mimetools.choose_boundary()
>         crlf = '\r\n'
>
>         l = []
>         for k, v in fields.iteritems():
>             l.append('--' + boundary)
>             l.append('Content-Disposition: form-data; name="%s"' % k)
>             l.append('')
>             l.append(v)
>         for (k, f, v) in files:
>             l.append('--' + boundary)
>             l.append('Content-Disposition: form-data; name="%s";
> filename="%s"' % (k, f))
>             l.append('Content-Type: %s' % self.__getContentType(f))
>             l.append('')
>             l.append(v)
>         l.append('--' + boundary + '--')
>         l.append('')
>         body = crlf.join(l)
>
>         return boundary, body
>
>     def __getContentType(self, filename):
>         return mimetypes.guess_type(filename)[0] or 'application/octet-
> stream'
>
>     def upload(self, path, post_params, files, token=None):
>
>       if token:
>         token = oauth.OAuthToken.from_string(token)
>
>       url = "http://%s%s" % (self.authority, path)
>
>       (boundary, body) = self.__encodeMultipart(post_params, files)
>
>       headers = {'Content-Type': 'multipart/form-data; boundary=%s' %
> boundary,
>           'Content-Length': str(len(body))
>           }
>
>       request = oauth.OAuthRequest.from_consumer_and_token(
>         self.consumer,
>         token,
>         http_method='POST',
>         http_url=url,
>         parameters=post_params
>       )
>
>       request.sign_request(oauth.OAuthSignatureMethod_HMAC_SHA1(),
> self.consumer, token)
>
>       request = urllib2.Request(request.http_url, postdata=body,
> headers=headers)
>       request.get_method = lambda: 'POST'
>
>       return urllib2.urlopen(request)
>
> Unfortunately I get two kinds of unicode error, the first one in the
> crlf.join(l):
>
> Traceback (most recent call last):
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 242, in process
>     return self.handle()
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 233, in handle
>     return self._delegate(fn, self.fvars, args)
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 412, in _delegate
>     return handle_class(cls)
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 387, in handle_class
>     return tocall(*args)
>   File "/Users/oscar/projects/work/whitelabel/code.py", line 328, in
> POST
>     return simplejson.load(client.upload(upload_url, params, files,
> access_token()))
>   File "/Users/oscar/projects/work/whitelabel/oauth_client.py", line
> 131, in upload
>     (boundary, body) = self.__encodeMultipart(post_params, files)
>   File "/Users/oscar/projects/work/whitelabel/oauth_client.py", line
> 111, in __encodeMultipart
>     body = crlf.join(l)
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xb7 in position
> 42: ordinal not in range(128)
>
>
> And here's another one:
>
> Traceback (most recent call last):
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 242, in process
>     return self.handle()
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 233, in handle
>     return self._delegate(fn, self.fvars, args)
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 412, in _delegate
>     return handle_class(cls)
>   File "/Users/oscar/projects/work/whitelabel/web/application.py",
> line 387, in handle_class
>     return tocall(*args)
>   File "/Users/oscar/projects/work/whitelabel/code.py", line 328, in
> POST
>     return simplejson.load(client.upload(upload_url, params, files,
> access_token()))
>   File "/Users/oscar/projects/work/whitelabel/oauth_client.py", line
> 131, in upload
>     (boundary, body) = self.__encodeMultipart(post_params, files)
>   File "/Users/oscar/projects/work/whitelabel/oauth_client.py", line
> 111, in __encodeMultipart
>     body = crlf.join(l)
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xb7 in position
> 42: ordinal not in range(128)
>
> Does anyone know why these errors happen and what I should do to
> prevent them? Many thanks.
>
> Oscar
>
>   
I did a short test to demonstrate the likely problem, without all the 
other libraries and complexity.

lst = ["abc"]
lst.append("def")
lst.append(u"abc")
lst.append("g\x48\x82\x94i")
print lst
print "**".join(lst)

That fragment of code generates (in Python 2.6) the following output and 
traceback:

['abc', 'def', u'abc', 'gH\x82\x94i']
Traceback (most recent call last):
  File "M:\Programming\Python\sources\dummy\stuff2.py", line 10, in <module>
    print "**".join(lst)
UnicodeDecodeError: 'ascii' codec can't decode byte 0x82 in position 2: 
ordinal not in range(128)

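You'll notice that one of the strings is a unicode one, and another one 
has the byte 0x82 in it.  Once join() discovers a Unicode element, it 
needs to produce a Unicode string, so it implicitly decodes every byte 
string in the list, and by default it uses the ASCII codec to do that.  
Bytes above 0x7f are not valid ASCII, hence the UnicodeDecodeError.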

If you print your 'l' list (bad name, by the way, looks too much like a 
'1'), you can see which element is Unicode, and which one has the \xb7 
in position 42.  You'll have to decide which is the problem, and solve 
it accordingly.  Was the fact that one of the strings is unicode an 
oversight?  Or did you think that all characters would be 0x7f or less?  
Or do you want to handle all possible characters, and if so, with what 
encoding?

DaveA