[pypy-svn] r75720 - in pypy/branch/interplevel-codecs/pypy: module/_codecs/test rlib
afa at codespeak.net
afa at codespeak.net
Thu Jul 1 16:29:21 CEST 2010
Author: afa
Date: Thu Jul 1 16:29:19 2010
New Revision: 75720
Modified:
pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
Log:
Decode utf-7 characters as soon as 16 bits have been accumulated,
to avoid integer overflow when an "inShift" sequence is too long.
Modified: pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py (original)
+++ pypy/branch/interplevel-codecs/pypy/module/_codecs/test/test_codecs.py Thu Jul 1 16:29:19 2010
@@ -541,6 +541,9 @@
assert exc.start == 0
assert exc.end == 3
+ def test_utf7_surrogate(self):
+ raises(UnicodeDecodeError, '+3ADYAA-'.decode, 'utf-7')
+
def test_utf_16_encode_decode(self):
import codecs
x = u'123abc'
Modified: pypy/branch/interplevel-codecs/pypy/rlib/runicode.py
==============================================================================
--- pypy/branch/interplevel-codecs/pypy/rlib/runicode.py (original)
+++ pypy/branch/interplevel-codecs/pypy/rlib/runicode.py Thu Jul 1 16:29:19 2010
@@ -471,6 +471,32 @@
bits -= 6
return bits
+def _utf7_DECODE(s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate):
+ while bitsleft >= 16:
+ outCh = (charsleft >> (bitsleft-16)) & 0xffff
+ bitsleft -= 16
+
+ if surrogate:
+ ## We have already generated an error for the high
+ ## surrogate so let's not bother seeing if the low
+ ## surrogate is correct or not
+ surrogate = False
+ elif 0xDC00 <= outCh <= 0xDFFF:
+ ## This is a surrogate pair. Unfortunately we can't
+ ## represent it in a 16-bit character
+ surrogate = True
+ msg = "code pairs are not supported"
+ res, pos = errorhandler(errors, 'utf-7',
+ msg, s, pos-1, pos)
+ result.append(res)
+ bitsleft = 0
+ break
+ else:
+ result.append(unichr(outCh))
+ return pos, charsleft, bitsleft, surrogate
+
+
def str_decode_utf_7(s, size, errors, final=False,
errorhandler=None):
if errorhandler is None:
@@ -495,27 +521,9 @@
inShift = 0
pos += 1
- while bitsleft >= 16:
- outCh = (charsleft >> (bitsleft-16)) & 0xffff
- bitsleft -= 16
-
- if surrogate:
- ## We have already generated an error for the high
- ## surrogate so let's not bother seeing if the low
- ## surrogate is correct or not
- surrogate = False
- elif 0xDC00 <= outCh <= 0xDFFF:
- ## This is a surrogate pair. Unfortunately we can't
- ## represent it in a 16-bit character
- surrogate = True
- msg = "code pairs are not supported"
- res, pos = errorhandler(errors, 'utf-7',
- msg, s, pos-1, pos)
- result.append(res)
- bitsleft = 0
- break
- else:
- result.append(unichr(outCh))
+ pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+ s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate)
if bitsleft >= 6:
## The shift sequence has a partial character in it. If
## bitsleft < 6 then we could just classify it as padding
@@ -544,6 +552,10 @@
charsleft = (charsleft << 6) | _utf7_FROM_BASE64(ch)
bitsleft += 6
pos += 1
+
+ pos, charsleft, bitsleft, surrogate = _utf7_DECODE(
+ s, result, errorhandler, errors,
+ pos, charsleft, bitsleft, surrogate)
elif ch == '+':
startinpos = pos
pos += 1
More information about the Pypy-commit
mailing list