[Python-checkins] cpython: Issue #12016: Multibyte CJK decoders now resynchronize faster

victor.stinner python-checkins at python.org
Fri Jul 8 01:45:23 CEST 2011


http://hg.python.org/cpython/rev/16cbd84de848
changeset:   71247:16cbd84de848
user:        Victor Stinner <victor.stinner at haypocalc.com>
date:        Fri Jul 08 01:45:13 2011 +0200
summary:
  Issue #12016: Multibyte CJK decoders now resynchronize faster

They only ignore the first byte of an invalid byte sequence.

For example, b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of
'\ufffd'.

files:
  Doc/whatsnew/3.3.rst               |  23 ++++
  Lib/test/test_codecencodings_cn.py |  21 ++-
  Lib/test/test_codecencodings_hk.py |   4 +-
  Lib/test/test_codecencodings_jp.py |  96 +++++++++++------
  Lib/test/test_codecencodings_kr.py |  25 ++-
  Lib/test/test_codecencodings_tw.py |   4 +-
  Lib/test/test_codecmaps_tw.py      |   3 +
  Misc/NEWS                          |   4 +
  Modules/cjkcodecs/_codecs_cn.c     |  14 +-
  Modules/cjkcodecs/_codecs_hk.c     |   2 +-
  Modules/cjkcodecs/_codecs_jp.c     |  34 +++---
  Modules/cjkcodecs/_codecs_kr.c     |  18 +-
  Modules/cjkcodecs/_codecs_tw.c     |   4 +-
  13 files changed, 159 insertions(+), 93 deletions(-)


diff --git a/Doc/whatsnew/3.3.rst b/Doc/whatsnew/3.3.rst
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -68,6 +68,29 @@
 
 * Stub
 
+codecs
+------
+
+Multibyte CJK decoders now resynchronize faster. They only ignore the first
+byte of an invalid byte sequence. For example, b'\xff\n'.decode('gb2312',
+'replace') gives '\ufffd\n' instead of '\ufffd'.
+
+(http://bugs.python.org/issue12016)
+
+Don't reset incremental encoders of CJK codecs at each call to their encode()
+method anymore. For example: ::
+
+    $ ./python -q
+    >>> import codecs
+    >>> encoder = codecs.getincrementalencoder('hz')('strict')
+    >>> b''.join(encoder.encode(x) for x in '\u52ff\u65bd\u65bc\u4eba\u3002 Bye.')
+    b'~{NpJ)l6HK!#~} Bye.'
+
+This example gives b'~{Np~}~{J)~}~{l6~}~{HK~}~{!#~} Bye.' with older Python
+versions.
+
+(http://bugs.python.org/issue12100)
+
 faulthandler
 ------------
 
diff --git a/Lib/test/test_codecencodings_cn.py b/Lib/test/test_codecencodings_cn.py
--- a/Lib/test/test_codecencodings_cn.py
+++ b/Lib/test/test_codecencodings_cn.py
@@ -15,8 +15,8 @@
         # invalid bytes
         (b"abc\x81\x81\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x81\x81\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x81\x81\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x81\x81\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\xc1\x64", "strict", None),
     )
@@ -28,8 +28,8 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
         (b"\x83\x34\x83\x31", "strict", None),
         ("\u30fb", "strict", None),
@@ -42,11 +42,14 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u804a"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u804a\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u804a"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u804a\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u804a"),
-        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd\u804a"),
+        (b"abc\x84\x39\x84\x39\xc1\xc4", "replace", "abc\ufffd9\ufffd9\u804a"),
         ("\u30fb", "strict", b"\x819\xa79"),
+        (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'),
+        (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'),
+        (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'),
     )
     has_iso10646 = True
 
@@ -74,9 +77,11 @@
          '\u5df1\u6240\u4e0d\u6b32\uff0c\u52ff\u65bd\u65bc\u4eba\u3002'
          'Bye.\n'),
         # invalid bytes
-        (b'ab~cd', 'replace', 'ab\uFFFDd'),
+        (b'ab~cd', 'replace', 'ab\uFFFDcd'),
         (b'ab\xffcd', 'replace', 'ab\uFFFDcd'),
         (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'),
+        (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'),
+        (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"),
     )
 
 def test_main():
diff --git a/Lib/test/test_codecencodings_hk.py b/Lib/test/test_codecencodings_hk.py
--- a/Lib/test/test_codecencodings_hk.py
+++ b/Lib/test/test_codecencodings_hk.py
@@ -15,8 +15,8 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
diff --git a/Lib/test/test_codecencodings_jp.py b/Lib/test/test_codecencodings_jp.py
--- a/Lib/test/test_codecencodings_jp.py
+++ b/Lib/test/test_codecencodings_jp.py
@@ -15,50 +15,57 @@
         # invalid bytes
         (b"abc\x81\x00\x81\x00\x82\x84", "strict",  None),
         (b"abc\xf8", "strict",  None),
-        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x81\x00\x82\x84", "ignore",  "abc\uff44"),
+        (b"abc\x81\x00\x82\x84", "replace", "abc\ufffd\x00\uff44"),
+        (b"abc\x81\x00\x82\x84\x88", "replace", "abc\ufffd\x00\uff44\ufffd"),
+        (b"abc\x81\x00\x82\x84", "ignore",  "abc\x00\uff44"),
+        (b"ab\xEBxy", "replace", "ab\uFFFDxy"),
+        (b"ab\xF0\x39xy", "replace", "ab\uFFFD9xy"),
+        (b"ab\xEA\xF0xy", "replace", 'ab\ufffd\ue038y'),
         # sjis vs cp932
         (b"\\\x7e", "replace", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\uff3c\u2225\uff0d"),
     )
 
+euc_commontests = (
+    # invalid bytes
+    (b"abc\x80\x80\xc1\xc4", "strict",  None),
+    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u7956"),
+    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u7956\ufffd"),
+    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
+    (b"abc\xc8", "strict",  None),
+    (b"abc\x8f\x83\x83", "replace", "abc\ufffd\ufffd\ufffd"),
+    (b"\x82\xFCxy", "replace", "\ufffd\ufffdxy"),
+    (b"\xc1\x64", "strict", None),
+    (b"\xa1\xc0", "strict", "\uff3c"),
+    (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    (b"\x8eXY", "replace", "\ufffdXY"),
+)
+
+class Test_EUC_JIS_2004(test_multibytecodec_support.TestBase,
+                        unittest.TestCase):
+    encoding = 'euc_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
+    codectests = euc_commontests
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
+    )
+
 class Test_EUC_JISX0213(test_multibytecodec_support.TestBase,
                         unittest.TestCase):
     encoding = 'euc_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('euc_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\xc1\xc4", "strict",  None),
-        (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-        (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-        (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-        (b"\xc1\x64", "strict", None),
-        (b"\xa1\xc0", "strict", "\uff3c"),
-    )
+    codectests = euc_commontests
     xmlcharnametest = (
         "\xab\u211c\xbb = \u2329\u1234\u232a",
         b"\xa9\xa8&real;\xa9\xb2 = &lang;&#4660;&rang;"
     )
 
-eucjp_commontests = (
-    (b"abc\x80\x80\xc1\xc4", "strict",  None),
-    (b"abc\xc8", "strict",  None),
-    (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u7956"),
-    (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u7956\ufffd"),
-    (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u7956"),
-    (b"abc\x8f\x83\x83", "replace", "abc\ufffd"),
-    (b"\xc1\x64", "strict", None),
-)
-
 class Test_EUC_JP_COMPAT(test_multibytecodec_support.TestBase,
                          unittest.TestCase):
     encoding = 'euc_jp'
     tstring = test_multibytecodec_support.load_teststring('euc_jp')
-    codectests = eucjp_commontests + (
-        (b"\xa1\xc0\\", "strict", "\uff3c\\"),
+    codectests = euc_commontests + (
         ("\xa5", "strict", b"\x5c"),
         ("\u203e", "strict", b"\x7e"),
     )
@@ -66,8 +73,6 @@
 shiftjis_commonenctests = (
     (b"abc\x80\x80\x82\x84", "strict",  None),
     (b"abc\xf8", "strict",  None),
-    (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-    (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
     (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
 )
 
@@ -75,20 +80,41 @@
     encoding = 'shift_jis'
     tstring = test_multibytecodec_support.load_teststring('shift_jis')
     codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         (b"\\\x7e", "strict", "\\\x7e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\uff3c\u2016\u2212"),
+        (b"abc\x81\x39", "replace",  "abc\ufffd9"),
+        (b"abc\xEA\xFC", "replace",  "abc\ufffd\ufffd"),
+        (b"abc\xFF\x58", "replace",  "abc\ufffdX"),
+    )
+
+class Test_SJIS_2004(test_multibytecodec_support.TestBase, unittest.TestCase):
+    encoding = 'shift_jis_2004'
+    tstring = test_multibytecodec_support.load_teststring('shift_jis')
+    codectests = shiftjis_commonenctests + (
+        (b"\\\x7e", "strict", "\xa5\u203e"),
+        (b"\x81\x5f\x81\x61\x81\x7c", "strict", "\\\u2016\u2212"),
+        (b"abc\xEA\xFC", "strict",  "abc\u64bf"),
+        (b"\x81\x39xy", "replace",  "\ufffd9xy"),
+        (b"\xFF\x58xy", "replace",  "\ufffdXxy"),
+        (b"\x80\x80\x82\x84xy", "replace", "\ufffd\ufffd\uff44xy"),
+        (b"\x80\x80\x82\x84\x88xy", "replace", "\ufffd\ufffd\uff44\u5864y"),
+        (b"\xFC\xFBxy", "replace", '\ufffd\u95b4y'),
+    )
+    xmlcharnametest = (
+        "\xab\u211c\xbb = \u2329\u1234\u232a",
+        b"\x85G&real;\x85Q = &lang;&#4660;&rang;"
     )
 
 class Test_SJISX0213(test_multibytecodec_support.TestBase, unittest.TestCase):
     encoding = 'shift_jisx0213'
     tstring = test_multibytecodec_support.load_teststring('shift_jisx0213')
-    codectests = (
-        # invalid bytes
-        (b"abc\x80\x80\x82\x84", "strict",  None),
-        (b"abc\xf8", "strict",  None),
-        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\uff44"),
-        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\uff44\ufffd"),
-        (b"abc\x80\x80\x82\x84def", "ignore",  "abc\uff44def"),
+    codectests = shiftjis_commonenctests + (
+        (b"abc\x80\x80\x82\x84", "replace", "abc\ufffd\ufffd\uff44"),
+        (b"abc\x80\x80\x82\x84\x88", "replace", "abc\ufffd\ufffd\uff44\ufffd"),
+
         # sjis vs cp932
         (b"\\\x7e", "replace", "\xa5\u203e"),
         (b"\x81\x5f\x81\x61\x81\x7c", "replace", "\x5c\u2016\u2212"),
diff --git a/Lib/test/test_codecencodings_kr.py b/Lib/test/test_codecencodings_kr.py
--- a/Lib/test/test_codecencodings_kr.py
+++ b/Lib/test/test_codecencodings_kr.py
@@ -15,8 +15,8 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\uc894"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
     )
 
@@ -27,8 +27,8 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\uc894"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\uc894\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", 'abc\ufffd\ufffd\uc894'),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\uc894\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\uc894"),
 
         # composed make-up sequence errors
@@ -40,13 +40,14 @@
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "strict", "\uc4d4"),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4x", "strict", "\uc4d4x"),
-        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", "a\ufffd"),
+        (b"a\xa4\xd4\xa4\xb6\xa4", "replace", 'a\ufffd'),
         (b"\xa4\xd4\xa3\xb6\xa4\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa3\xd0\xa4\xd4", "strict", None),
         (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa3\xd4", "strict", None),
-        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", "\ufffd"),
-        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", "\ufffd"),
+        (b"\xa4\xd4\xa4\xff\xa4\xd0\xa4\xd4", "replace", '\ufffd\u6e21\ufffd\u3160\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xff\xa4\xd4", "replace", '\ufffd\u6e21\ub544\ufffd\ufffd'),
+        (b"\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xff", "replace", '\ufffd\u6e21\ub544\u572d\ufffd'),
+        (b"\xa4\xd4\xff\xa4\xd4\xa4\xb6\xa4\xd0\xa4\xd4", "replace", '\ufffd\ufffd\ufffd\uc4d4'),
         (b"\xc1\xc4", "strict", "\uc894"),
     )
 
@@ -57,9 +58,13 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ucd27"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ucd27\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\ucd27"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\ucd27\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\ucd27"),
+        (b"\xD8abc", "replace",  "\uFFFDabc"),
+        (b"\xD8\xFFabc", "replace",  "\uFFFD\uFFFDabc"),
+        (b"\x84bxy", "replace",  "\uFFFDbxy"),
+        (b"\x8CBxy", "replace",  "\uFFFDBxy"),
     )
 
 def test_main():
diff --git a/Lib/test/test_codecencodings_tw.py b/Lib/test/test_codecencodings_tw.py
--- a/Lib/test/test_codecencodings_tw.py
+++ b/Lib/test/test_codecencodings_tw.py
@@ -15,8 +15,8 @@
         # invalid bytes
         (b"abc\x80\x80\xc1\xc4", "strict",  None),
         (b"abc\xc8", "strict",  None),
-        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\u8b10"),
-        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\u8b10\ufffd"),
+        (b"abc\x80\x80\xc1\xc4", "replace", "abc\ufffd\ufffd\u8b10"),
+        (b"abc\x80\x80\xc1\xc4\xc8", "replace", "abc\ufffd\ufffd\u8b10\ufffd"),
         (b"abc\x80\x80\xc1\xc4", "ignore",  "abc\u8b10"),
     )
 
diff --git a/Lib/test/test_codecmaps_tw.py b/Lib/test/test_codecmaps_tw.py
--- a/Lib/test/test_codecmaps_tw.py
+++ b/Lib/test/test_codecmaps_tw.py
@@ -23,6 +23,9 @@
         (b'\xa2\xcc', '\u5341'),
         (b'\xa2\xce', '\u5345'),
     ]
+    codectests = (
+        (b"\xFFxy", "replace",  "\ufffdxy"),
+    )
 
 def test_main():
     support.run_unittest(__name__)
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -219,6 +219,10 @@
 Library
 -------
 
+- Issue #12016: Multibyte CJK decoders now resynchronize faster. They only
+  ignore the first byte of an invalid byte sequence. For example,
+  b'\xff\n'.decode('gb2312', 'replace') gives '\ufffd\n' instead of '\ufffd'.
+
 - Issue #12459: time.sleep() now raises a ValueError if the sleep length is
   negative, instead of an infinite sleep on Windows or raising an IOError on
   Linux for example, to have the same behaviour on all platforms.
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -85,7 +85,7 @@
         TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -141,7 +141,7 @@
         REQUIRE_INBUF(2)
 
         GBK_DECODE(c, IN2, **outbuf)
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -267,7 +267,7 @@
             c3 = IN3;
             c4 = IN4;
             if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
-                return 4;
+                return 1;
             c -= 0x81;  c2 -= 0x30;
             c3 -= 0x81; c4 -= 0x30;
 
@@ -292,12 +292,12 @@
                     continue;
                 }
             }
-            return 4;
+            return 1;
         }
 
         GBK_DECODE(c, c2, **outbuf)
         else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -400,7 +400,7 @@
             else if (c2 == '\n')
                 ; /* line-continuation */
             else
-                return 2;
+                return 1;
             NEXT(2, 0);
             continue;
         }
@@ -419,7 +419,7 @@
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
     }
 
diff --git a/Modules/cjkcodecs/_codecs_hk.c b/Modules/cjkcodecs/_codecs_hk.c
--- a/Modules/cjkcodecs/_codecs_hk.c
+++ b/Modules/cjkcodecs/_codecs_hk.c
@@ -161,7 +161,7 @@
         case 0x8864: WRITE2(0x00ca, 0x030c); break;
         case 0x88a3: WRITE2(0x00ea, 0x0304); break;
         case 0x88a5: WRITE2(0x00ea, 0x030c); break;
-        default: return 2;
+        default: return 1;
         }
 
         NEXT(2, 2) /* all decoded codepoints are pairs, above. */
diff --git a/Modules/cjkcodecs/_codecs_jp.c b/Modules/cjkcodecs/_codecs_jp.c
--- a/Modules/cjkcodecs/_codecs_jp.c
+++ b/Modules/cjkcodecs/_codecs_jp.c
@@ -112,7 +112,7 @@
         TRYMAP_DEC(cp932ext, **outbuf, c, c2);
         else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -120,7 +120,7 @@
             c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
 
             TRYMAP_DEC(jisx0208, **outbuf, c, c2);
-            else return 2;
+            else return 1;
         }
         else if (c >= 0xf0 && c <= 0xf9) {
             if ((c2 >= 0x40 && c2 <= 0x7e) ||
@@ -128,10 +128,10 @@
                 OUT1(0xe000 + 188 * (c - 0xf0) +
                      (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(2, 1)
     }
@@ -256,7 +256,7 @@
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -274,7 +274,7 @@
                 continue;
             }
             else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
-            else return 3;
+            else return 1;
             NEXT(3, 1)
         }
         else {
@@ -300,7 +300,7 @@
                 NEXT(2, 2)
                 continue;
             }
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -388,7 +388,7 @@
                 NEXT(2, 1)
             }
             else
-                return 2;
+                return 1;
         }
         else if (c == 0x8f) {
             unsigned char c2, c3;
@@ -401,7 +401,7 @@
                 NEXT(3, 1)
             }
             else
-                return 3;
+                return 1;
         }
         else {
             unsigned char c2;
@@ -417,7 +417,7 @@
 #endif
                 TRYMAP_DEC(jisx0208, **outbuf,
                            c ^ 0x80, c2 ^ 0x80) ;
-            else return 2;
+            else return 1;
             NEXT(2, 1)
         }
     }
@@ -502,7 +502,7 @@
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -522,10 +522,10 @@
                 continue;
             }
             else
-                return 2;
+                return 1;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
@@ -645,7 +645,7 @@
             REQUIRE_INBUF(2)
             c2 = IN2;
             if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
-                return 2;
+                return 1;
 
             c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
             c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
@@ -671,7 +671,7 @@
                     NEXT_OUT(2)
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT_IN(2)
             }
             else { /* Plane 2 */
@@ -689,13 +689,13 @@
                     continue;
                 }
                 else
-                    return 2;
+                    return 1;
                 NEXT(2, 1)
             }
             continue;
         }
         else
-            return 2;
+            return 1;
 
         NEXT(1, 1) /* JIS X 0201 */
     }
diff --git a/Modules/cjkcodecs/_codecs_kr.c b/Modules/cjkcodecs/_codecs_kr.c
--- a/Modules/cjkcodecs/_codecs_kr.c
+++ b/Modules/cjkcodecs/_codecs_kr.c
@@ -123,7 +123,7 @@
             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
-                return 8;
+                return 1;
 
             c = (*inbuf)[3];
             if (0xa1 <= c && c <= 0xbe)
@@ -143,7 +143,7 @@
                 jong = NONE;
 
             if (cho == NONE || jung == NONE || jong == NONE)
-                return 8;
+                return 1;
 
             OUT1(0xac00 + cho*588 + jung*28 + jong);
             NEXT(8, 1)
@@ -152,7 +152,7 @@
             NEXT(2, 1)
         }
         else
-            return 2;
+            return 1;
     }
 
     return 0;
@@ -208,7 +208,7 @@
         REQUIRE_INBUF(2)
         TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
         else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }
@@ -375,7 +375,7 @@
             i_jong = johabidx_jongseong[c_jong];
 
             if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
-                return 2;
+                return 1;
 
             /* we don't use U+1100 hangul jamo yet. */
             if (i_cho == FILL) {
@@ -391,7 +391,7 @@
                         OUT1(0x3100 |
                           johabjamo_jungseong[c_jung])
                     else
-                        return 2;
+                        return 1;
                 }
             } else {
                 if (i_jung == FILL) {
@@ -399,7 +399,7 @@
                         OUT1(0x3100 |
                           johabjamo_choseong[c_cho])
                     else
-                        return 2;
+                        return 1;
                 }
                 else
                     OUT1(0xac00 +
@@ -414,7 +414,7 @@
                 c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
                 (c2 & 0x7f) == 0x7f ||
                 (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
-                return 2;
+                return 1;
             else {
                 unsigned char t1, t2;
 
@@ -425,7 +425,7 @@
                 t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;
 
                 TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
-                else return 2;
+                else return 1;
                 NEXT(2, 1)
             }
         }
diff --git a/Modules/cjkcodecs/_codecs_tw.c b/Modules/cjkcodecs/_codecs_tw.c
--- a/Modules/cjkcodecs/_codecs_tw.c
+++ b/Modules/cjkcodecs/_codecs_tw.c
@@ -55,7 +55,7 @@
         TRYMAP_DEC(big5, **outbuf, c, IN2) {
             NEXT(2, 1)
         }
-        else return 2;
+        else return 1;
     }
 
     return 0;
@@ -109,7 +109,7 @@
 
         TRYMAP_DEC(cp950ext, **outbuf, c, IN2);
         else TRYMAP_DEC(big5, **outbuf, c, IN2);
-        else return 2;
+        else return 1;
 
         NEXT(2, 1)
     }

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list