[pypy-commit] pypy py3.5: Add support for rejecting lone surrogates in utf16 and utf32 encoders.
amauryfa
pypy.commits at gmail.com
Tue Apr 18 18:55:38 EDT 2017
Author: Amaury Forgeot d'Arc <amauryfa at gmail.com>
Branch: py3.5
Changeset: r91090:7ce52a9f8d1f
Date: 2017-04-19 00:53 +0200
http://bitbucket.org/pypy/pypy/changeset/7ce52a9f8d1f/
Log: Add support for rejecting lone surrogates in utf16 and utf32
encoders.
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -591,7 +591,10 @@
def unicode_encode_utf_16_helper(s, size, errors,
errorhandler=None,
+ allow_surrogates=True,
byteorder='little'):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
if size == 0:
if byteorder == 'native':
result = StringBuilder(2)
@@ -604,34 +607,60 @@
_STORECHAR(result, 0xFEFF, BYTEORDER)
byteorder = BYTEORDER
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
- ch2 = 0
- if ch >= 0x10000:
- ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF)
- ch = 0xD800 | ((ch-0x10000) >> 10)
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
- _STORECHAR(result, ch, byteorder)
- if ch2:
- _STORECHAR(result, ch2, byteorder)
+ if ch < 0xD800:
+ _STORECHAR(result, ch, byteorder)
+ elif ch >= 0x10000:
+ _STORECHAR(result, 0xD800 | ((ch-0x10000) >> 10), byteorder)
+ _STORECHAR(result, 0xDC00 | ((ch-0x10000) & 0x3FF), byteorder)
+ elif ch >= 0xE000 or allow_surrogates:
+ _STORECHAR(result, ch, byteorder)
+ else:
+ ru, rs, pos = errorhandler(errors, 'utf16',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ if rs is not None:
+ # py3k only
+ if len(rs) % 2 != 0:
+ errorhandler('strict', 'utf16',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ result.append(rs)
+ continue
+ for ch in ru:
+ if ord(ch) < 0xD800:
+ _STORECHAR(result, ord(ch), byteorder)
+ else:
+ errorhandler('strict', 'utf16',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ continue
return result.build()
def unicode_encode_utf_16(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "native")
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native")
def unicode_encode_utf_16_be(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "big")
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big")
def unicode_encode_utf_16_le(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_16_helper(s, size, errors, errorhandler, "little")
+ errorhandler=None,
+ allow_surrogates=True):
+ return unicode_encode_utf_16_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little")
# ____________________________________________________________
@@ -756,7 +785,10 @@
def unicode_encode_utf_32_helper(s, size, errors,
errorhandler=None,
+ allow_surrogates=True,
byteorder='little'):
+ if errorhandler is None:
+ errorhandler = default_unicode_error_encode
if size == 0:
if byteorder == 'native':
result = StringBuilder(4)
@@ -769,33 +801,57 @@
_STORECHAR32(result, 0xFEFF, BYTEORDER)
byteorder = BYTEORDER
- i = 0
- while i < size:
- ch = ord(s[i])
- i += 1
+ pos = 0
+ while pos < size:
+ ch = ord(s[pos])
+ pos += 1
ch2 = 0
- if MAXUNICODE < 65536 and 0xD800 <= ch <= 0xDBFF and i < size:
- ch2 = ord(s[i])
- if 0xDC00 <= ch2 <= 0xDFFF:
- ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
- i += 1
+ if 0xD800 <= ch < 0xDC00:
+ if not allow_surrogates:
+ ru, rs, pos = errorhandler(errors, 'utf32',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ if rs is not None:
+ # py3k only
+ if len(rs) % 4 != 0:
+ errorhandler('strict', 'utf32',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ result.append(rs)
+ continue
+ for ch in ru:
+ if ord(ch) < 0xD800:
+ _STORECHAR32(result, ord(ch), byteorder)
+ else:
+ errorhandler('strict', 'utf32',
+ 'surrogates not allowed',
+ s, pos-1, pos)
+ continue
+ elif MAXUNICODE < 65536 and pos < size:
+ ch2 = ord(s[pos])
+ if 0xDC00 <= ch2 < 0xE000:
+ ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+ pos += 1
_STORECHAR32(result, ch, byteorder)
return result.build()
def unicode_encode_utf_32(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "native")
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "native")
def unicode_encode_utf_32_be(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "big")
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "big")
def unicode_encode_utf_32_le(s, size, errors,
- errorhandler=None):
- return unicode_encode_utf_32_helper(s, size, errors, errorhandler, "little")
+ errorhandler=None, allow_surrogates=True):
+ return unicode_encode_utf_32_helper(s, size, errors, errorhandler,
+ allow_surrogates, "little")
# ____________________________________________________________
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -223,6 +223,40 @@
py.test.raises(UnicodeDecodeError, runicode.str_decode_utf_16_le,
s, len(s), True)
+ def test_utf16_surrogates(self):
+ assert runicode.unicode_encode_utf_16_be(
+ u"\ud800", 1, None) == '\xd8\x00'
+ py.test.raises(UnicodeEncodeError, runicode.unicode_encode_utf_16_be,
+ u"\ud800", 1, None, allow_surrogates=False)
+ def replace_with(ru, rs):
+ def errorhandler(errors, enc, msg, u, startingpos, endingpos):
+ if errors == 'strict':
+ raise UnicodeEncodeError(enc, u, startingpos,
+ endingpos, msg)
+ return ru, rs, endingpos
+ return runicode.unicode_encode_utf_16_be(
+ u"<\ud800>", 3, None,
+ errorhandler, allow_surrogates=False)
+ assert replace_with(u'rep', None) == '\x00<\x00r\x00e\x00p\x00>'
+ assert replace_with(None, '\xca\xfe') == '\x00<\xca\xfe\x00>'
+
+ def test_utf32_surrogates(self):
+ assert runicode.unicode_encode_utf_32_be(
+ u"\ud800", 1, None) == '\x00\x00\xd8\x00'
+ py.test.raises(UnicodeEncodeError, runicode.unicode_encode_utf_32_be,
+ u"\ud800", 1, None, allow_surrogates=False)
+ def replace_with(ru, rs):
+ def errorhandler(errors, enc, msg, u, startingpos, endingpos):
+ if errors == 'strict':
+ raise UnicodeEncodeError(enc, u, startingpos,
+ endingpos, msg)
+ return ru, rs, endingpos
+ return runicode.unicode_encode_utf_32_be(
+ u"<\ud800>", 3, None,
+ errorhandler, allow_surrogates=False)
+ assert replace_with(u'rep', None) == u'<rep>'.encode('utf-32-be')
+ assert replace_with(None, '\xca\xfe\xca\xfe') == '\x00\x00\x00<\xca\xfe\xca\xfe\x00\x00\x00>'
+
def test_utf7_bugs(self):
u = u'A\u2262\u0391.'
assert runicode.unicode_encode_utf_7(u, len(u), None) == 'A+ImIDkQ.'
More information about the pypy-commit
mailing list