[pypy-commit] pypy py3k: Fix utf-8 encoding; all test_runicode tests pass.
amauryfa
noreply at buildbot.pypy.org
Tue Oct 18 00:37:53 CEST 2011
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3k
Changeset: r48163:8942f2c46162
Date: 2011-10-17 20:18 +0200
http://bitbucket.org/pypy/pypy/changeset/8942f2c46162/
Log: Fix utf-8 encoding; all test_runicode tests pass.
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -253,6 +253,8 @@
result.append((chr((0x80 | (ch & 0x3f)))))
def unicode_encode_utf_8(s, size, errors, errorhandler=None):
+ if errorhandler is None:
+ errorhandler = raise_unicode_exception_encode
assert(size >= 0)
result = StringBuilder(size)
pos = 0
@@ -279,11 +281,14 @@
pos += 1
_encodeUCS4(result, ch3)
continue
- r, pos = errorhandler(errors, 'utf-8',
- 'surrogates not allowed',
- s, pos-1, pos)
- result.append(r)
- continue
+ r, pos = errorhandler(errors, 'utf-8',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ result.append(r)
+ continue
+ result.append((chr((0xe0 | (ch >> 12)))))
+ result.append((chr((0x80 | ((ch >> 6) & 0x3f)))))
+ result.append((chr((0x80 | (ch & 0x3f)))))
else:
_encodeUCS4(result, ch)
return result.build()
diff --git a/pypy/rlib/test/test_runicode.py b/pypy/rlib/test/test_runicode.py
--- a/pypy/rlib/test/test_runicode.py
+++ b/pypy/rlib/test/test_runicode.py
@@ -118,6 +118,9 @@
for i in range(10000):
for encoding in ("utf-7 utf-8 utf-16 utf-16-be utf-16-le "
"utf-32 utf-32-be utf-32-le").split():
+ if encoding == 'utf-8' and 0xd800 <= i <= 0xdfff:
+ # Don't try to encode lone surrogates
+ continue
self.checkdecode(unichr(i), encoding)
def test_random(self):
More information about the pypy-commit mailing list