[Python-checkins] bpo-33578: Add getstate/setstate for CJK codec (GH-6984)

Thu Nov 1 06:48:56 EDT 2018

https://github.com/python/cpython/commit/ac22f6aa989f18c33c12615af1c66c73cf75d5e7
commit: ac22f6aa989f18c33c12615af1c66c73cf75d5e7
branch: master
author: Christopher Thorne <libcthorne at users.noreply.github.com>
committer: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
date: 2018-11-01T03:48:49-07:00
summary:

bpo-33578: Add getstate/setstate for CJK codec (GH-6984)



This implements getstate() and setstate() for the cjkcodecs multibyte incremental encoders and decoders, primarily to fix issues with seek()/tell() on text files opened with these codecs.

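The encoder getstate/setstate is slightly tricky because the "state" consists of the pending (not yet encoded) characters plus the MultibyteCodec_State bytes, yet only a single integer can be returned. The approach I've taken is to encode this data into a long (one length byte, then the pending text as UTF-8, then the codec state bytes, all interpreted as a little-endian integer), similar to how .tell() encodes a "cookie_type" as a long.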


https://bugs.python.org/issue33578

files:
A Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst
M Lib/test/test_io.py
M Lib/test/test_multibytecodec.py
M Misc/ACKS
M Modules/cjkcodecs/_codecs_cn.c
M Modules/cjkcodecs/clinic/multibytecodec.c.h
M Modules/cjkcodecs/multibytecodec.c
M Modules/cjkcodecs/multibytecodec.h

diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py
index d927bb96ceb5..14352ff84fff 100644
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@@ -2971,6 +2971,34 @@ def test_seek_and_tell_with_data(data, min_pos=0):
         finally:
             StatefulIncrementalDecoder.codecEnabled = 0
 
+    def test_multibyte_seek_and_tell(self):
+        f = self.open(support.TESTFN, "w", encoding="euc_jp")
+        f.write("AB\n\u3046\u3048\n")
+        f.close()
+
+        f = self.open(support.TESTFN, "r", encoding="euc_jp")
+        self.assertEqual(f.readline(), "AB\n")
+        p0 = f.tell()
+        self.assertEqual(f.readline(), "\u3046\u3048\n")
+        p1 = f.tell()
+        f.seek(p0)
+        self.assertEqual(f.readline(), "\u3046\u3048\n")
+        self.assertEqual(f.tell(), p1)
+        f.close()
+
+    def test_seek_with_encoder_state(self):
+        f = self.open(support.TESTFN, "w", encoding="euc_jis_2004")
+        f.write("\u00e6\u0300")
+        p0 = f.tell()
+        f.write("\u00e6")
+        f.seek(p0)
+        f.write("\u0300")
+        f.close()
+
+        f = self.open(support.TESTFN, "r", encoding="euc_jis_2004")
+        self.assertEqual(f.readline(), "\u00e6\u0300\u0300")
+        f.close()
+
     def test_encoded_writes(self):
         data = "1234567890"
         tests = ("utf-16",
diff --git a/Lib/test/test_multibytecodec.py b/Lib/test/test_multibytecodec.py
index 01a1cd3c693c..8e8362b70fd0 100644
--- a/Lib/test/test_multibytecodec.py
+++ b/Lib/test/test_multibytecodec.py
@@ -117,6 +117,88 @@ def test_stateful_keep_buffer(self):
         self.assertRaises(UnicodeEncodeError, encoder.encode, '\u0123')
         self.assertEqual(encoder.encode('', True), b'\xa9\xdc')
 
+    def test_state_methods_with_buffer_state(self):
+        # euc_jis_2004 stores state as a buffer of pending bytes
+        encoder = codecs.getincrementalencoder('euc_jis_2004')()
+
+        initial_state = encoder.getstate()
+        self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
+        encoder.setstate(initial_state)
+        self.assertEqual(encoder.encode('\u00e6\u0300'), b'\xab\xc4')
+
+        self.assertEqual(encoder.encode('\u00e6'), b'')
+        partial_state = encoder.getstate()
+        self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
+        encoder.setstate(partial_state)
+        self.assertEqual(encoder.encode('\u0300'), b'\xab\xc4')
+
+    def test_state_methods_with_non_buffer_state(self):
+        # iso2022_jp stores state without using a buffer
+        encoder = codecs.getincrementalencoder('iso2022_jp')()
+
+        self.assertEqual(encoder.encode('z'), b'z')
+        en_state = encoder.getstate()
+
+        self.assertEqual(encoder.encode('\u3042'), b'\x1b\x24\x42\x24\x22')
+        jp_state = encoder.getstate()
+        self.assertEqual(encoder.encode('z'), b'\x1b\x28\x42z')
+
+        encoder.setstate(jp_state)
+        self.assertEqual(encoder.encode('\u3042'), b'\x24\x22')
+
+        encoder.setstate(en_state)
+        self.assertEqual(encoder.encode('z'), b'z')
+
+    def test_getstate_returns_expected_value(self):
+        # Note: getstate is implemented such that these state values
+        # are expected to be the same across all builds of Python,
+        # regardless of x32/64 bit, endianness and compiler.
+
+        # euc_jis_2004 stores state as a buffer of pending bytes
+        buffer_state_encoder = codecs.getincrementalencoder('euc_jis_2004')()
+        self.assertEqual(buffer_state_encoder.getstate(), 0)
+        buffer_state_encoder.encode('\u00e6')
+        self.assertEqual(buffer_state_encoder.getstate(),
+                         int.from_bytes(
+                             b"\x02"
+                             b"\xc3\xa6"
+                             b"\x00\x00\x00\x00\x00\x00\x00\x00",
+                             'little'))
+        buffer_state_encoder.encode('\u0300')
+        self.assertEqual(buffer_state_encoder.getstate(), 0)
+
+        # iso2022_jp stores state without using a buffer
+        non_buffer_state_encoder = codecs.getincrementalencoder('iso2022_jp')()
+        self.assertEqual(non_buffer_state_encoder.getstate(),
+                         int.from_bytes(
+                             b"\x00"
+                             b"\x42\x42\x00\x00\x00\x00\x00\x00",
+                             'little'))
+        non_buffer_state_encoder.encode('\u3042')
+        self.assertEqual(non_buffer_state_encoder.getstate(),
+                         int.from_bytes(
+                             b"\x00"
+                             b"\xc2\x42\x00\x00\x00\x00\x00\x00",
+                             'little'))
+
+    def test_setstate_validates_input_size(self):
+        encoder = codecs.getincrementalencoder('euc_jp')()
+        pending_size_nine = int.from_bytes(
+            b"\x09"
+            b"\x00\x00\x00\x00\x00\x00\x00\x00"
+            b"\x00\x00\x00\x00\x00\x00\x00\x00",
+            'little')
+        self.assertRaises(UnicodeError, encoder.setstate, pending_size_nine)
+
+    def test_setstate_validates_input_bytes(self):
+        encoder = codecs.getincrementalencoder('euc_jp')()
+        invalid_utf8 = int.from_bytes(
+            b"\x01"
+            b"\xff"
+            b"\x00\x00\x00\x00\x00\x00\x00\x00",
+            'little')
+        self.assertRaises(UnicodeDecodeError, encoder.setstate, invalid_utf8)
+
     def test_issue5640(self):
         encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
         self.assertEqual(encoder.encode('\xff'), b'\\xff')
@@ -165,6 +247,37 @@ def test_decode_unicode(self):
             decoder = codecs.getincrementaldecoder(enc)()
             self.assertRaises(TypeError, decoder.decode, "")
 
+    def test_state_methods(self):
+        decoder = codecs.getincrementaldecoder('euc_jp')()
+
+        # Decode a complete input sequence
+        self.assertEqual(decoder.decode(b'\xa4\xa6'), '\u3046')
+        pending1, _ = decoder.getstate()
+        self.assertEqual(pending1, b'')
+
+        # Decode first half of a partial input sequence
+        self.assertEqual(decoder.decode(b'\xa4'), '')
+        pending2, flags2 = decoder.getstate()
+        self.assertEqual(pending2, b'\xa4')
+
+        # Decode second half of a partial input sequence
+        self.assertEqual(decoder.decode(b'\xa6'), '\u3046')
+        pending3, _ = decoder.getstate()
+        self.assertEqual(pending3, b'')
+
+        # Jump back and decode second half of partial input sequence again
+        decoder.setstate((pending2, flags2))
+        self.assertEqual(decoder.decode(b'\xa6'), '\u3046')
+        pending4, _ = decoder.getstate()
+        self.assertEqual(pending4, b'')
+
+    def test_setstate_validates_input(self):
+        decoder = codecs.getincrementaldecoder('euc_jp')()
+        self.assertRaises(TypeError, decoder.setstate, 123)
+        self.assertRaises(TypeError, decoder.setstate, ("invalid", 0))
+        self.assertRaises(TypeError, decoder.setstate, (b"1234", "invalid"))
+        self.assertRaises(UnicodeError, decoder.setstate, (b"123456789", 0))
+
 class Test_StreamReader(unittest.TestCase):
     def test_bug1728403(self):
         try:
diff --git a/Misc/ACKS b/Misc/ACKS
index 043d604a3f96..08ff6d11fdd0 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -1626,6 +1626,7 @@ Nicolas M. Thiéry
 James Thomas
 Robin Thomas
 Brian Thorne
+Christopher Thorne
 Stephen Thorne
 Jeremy Thurgood
 Eric Tiedemann
diff --git a/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst b/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst
new file mode 100644
index 000000000000..4e2e4627dc54
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-06-08-23-55-34.bpo-33578.7oSsjG.rst
@@ -0,0 +1 @@
+Implement multibyte encoder/decoder state methods
diff --git a/Modules/cjkcodecs/_codecs_cn.c b/Modules/cjkcodecs/_codecs_cn.c
index 1fcc220b8db0..8a62f7e257c6 100644
--- a/Modules/cjkcodecs/_codecs_cn.c
+++ b/Modules/cjkcodecs/_codecs_cn.c
@@ -51,6 +51,12 @@
         ;                                                              \
     }
 
+/*
+ * codecs in this file use the first byte of MultibyteCodec_State.c[8]
+ * to store a 0 or 1 state value
+ */
+#define CN_STATE_OFFSET 0
+
 /*
  * GB2312 codec
  */
@@ -329,15 +335,15 @@ DECODER(gb18030)
 
 ENCODER_INIT(hz)
 {
-    state->i = 0;
+    state->c[CN_STATE_OFFSET] = 0;
     return 0;
 }
 
 ENCODER_RESET(hz)
 {
-    if (state->i != 0) {
+    if (state->c[CN_STATE_OFFSET] != 0) {
         WRITEBYTE2('~', '}');
-        state->i = 0;
+        state->c[CN_STATE_OFFSET] = 0;
         NEXT_OUT(2);
     }
     return 0;
@@ -350,10 +356,10 @@ ENCODER(hz)
         DBCHAR code;
 
         if (c < 0x80) {
-            if (state->i) {
+            if (state->c[CN_STATE_OFFSET]) {
                 WRITEBYTE2('~', '}');
                 NEXT_OUT(2);
-                state->i = 0;
+                state->c[CN_STATE_OFFSET] = 0;
             }
             WRITEBYTE1((unsigned char)c);
             NEXT(1, 1);
@@ -375,10 +381,10 @@ ENCODER(hz)
         if (code & 0x8000) /* MSB set: GBK */
             return 1;
 
-        if (state->i == 0) {
+        if (state->c[CN_STATE_OFFSET] == 0) {
             WRITEBYTE4('~', '{', code >> 8, code & 0xff);
             NEXT(1, 4);
-            state->i = 1;
+            state->c[CN_STATE_OFFSET] = 1;
         }
         else {
             WRITEBYTE2(code >> 8, code & 0xff);
@@ -391,13 +397,13 @@ ENCODER(hz)
 
 DECODER_INIT(hz)
 {
-    state->i = 0;
+    state->c[CN_STATE_OFFSET] = 0;
     return 0;
 }
 
 DECODER_RESET(hz)
 {
-    state->i = 0;
+    state->c[CN_STATE_OFFSET] = 0;
     return 0;
 }
 
@@ -411,14 +417,14 @@ DECODER(hz)
             unsigned char c2 = INBYTE2;
 
             REQUIRE_INBUF(2);
-            if (c2 == '~' && state->i == 0)
+            if (c2 == '~' && state->c[CN_STATE_OFFSET] == 0)
                 OUTCHAR('~');
-            else if (c2 == '{' && state->i == 0)
-                state->i = 1; /* set GB */
-            else if (c2 == '\n' && state->i == 0)
+            else if (c2 == '{' && state->c[CN_STATE_OFFSET] == 0)
+                state->c[CN_STATE_OFFSET] = 1; /* set GB */
+            else if (c2 == '\n' && state->c[CN_STATE_OFFSET] == 0)
                 ; /* line-continuation */
-            else if (c2 == '}' && state->i == 1)
-                state->i = 0; /* set ASCII */
+            else if (c2 == '}' && state->c[CN_STATE_OFFSET] == 1)
+                state->c[CN_STATE_OFFSET] = 0; /* set ASCII */
             else
                 return 1;
             NEXT_IN(2);
@@ -428,7 +434,7 @@ DECODER(hz)
         if (c & 0x80)
             return 1;
 
-        if (state->i == 0) { /* ASCII mode */
+        if (state->c[CN_STATE_OFFSET] == 0) { /* ASCII mode */
             OUTCHAR(c);
             NEXT_IN(1);
         }
diff --git a/Modules/cjkcodecs/clinic/multibytecodec.c.h b/Modules/cjkcodecs/clinic/multibytecodec.c.h
index 25857fc6d6f0..a58bb646a411 100644
--- a/Modules/cjkcodecs/clinic/multibytecodec.c.h
+++ b/Modules/cjkcodecs/clinic/multibytecodec.c.h
@@ -115,6 +115,50 @@ _multibytecodec_MultibyteIncrementalEncoder_encode(MultibyteIncrementalEncoderOb
     return return_value;
 }
 
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_getstate__doc__,
+"getstate($self, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF    \
+    {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalEncoder_getstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate(MultibyteIncrementalEncoderObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _multibytecodec_MultibyteIncrementalEncoder_getstate_impl(self);
+}
+
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_setstate__doc__,
+"setstate($self, state, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF    \
+    {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalEncoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalEncoder_setstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
+                                                          PyLongObject *statelong);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate(MultibyteIncrementalEncoderObject *self, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    PyLongObject *statelong;
+
+    if (!PyArg_Parse(arg, "O!:setstate", &PyLong_Type, &statelong)) {
+        goto exit;
+    }
+    return_value = _multibytecodec_MultibyteIncrementalEncoder_setstate_impl(self, statelong);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalEncoder_reset__doc__,
 "reset($self, /)\n"
 "--\n"
@@ -169,6 +213,50 @@ _multibytecodec_MultibyteIncrementalDecoder_decode(MultibyteIncrementalDecoderOb
     return return_value;
 }
 
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_getstate__doc__,
+"getstate($self, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF    \
+    {"getstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_getstate, METH_NOARGS, _multibytecodec_MultibyteIncrementalDecoder_getstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate(MultibyteIncrementalDecoderObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return _multibytecodec_MultibyteIncrementalDecoder_getstate_impl(self);
+}
+
+PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_setstate__doc__,
+"setstate($self, state, /)\n"
+"--\n"
+"\n");
+
+#define _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF    \
+    {"setstate", (PyCFunction)_multibytecodec_MultibyteIncrementalDecoder_setstate, METH_O, _multibytecodec_MultibyteIncrementalDecoder_setstate__doc__},
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
+                                                          PyObject *state);
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate(MultibyteIncrementalDecoderObject *self, PyObject *arg)
+{
+    PyObject *return_value = NULL;
+    PyObject *state;
+
+    if (!PyArg_Parse(arg, "O!:setstate", &PyTuple_Type, &state)) {
+        goto exit;
+    }
+    return_value = _multibytecodec_MultibyteIncrementalDecoder_setstate_impl(self, state);
+
+exit:
+    return return_value;
+}
+
 PyDoc_STRVAR(_multibytecodec_MultibyteIncrementalDecoder_reset__doc__,
 "reset($self, /)\n"
 "--\n"
@@ -330,4 +418,4 @@ PyDoc_STRVAR(_multibytecodec___create_codec__doc__,
 
 #define _MULTIBYTECODEC___CREATE_CODEC_METHODDEF    \
     {"__create_codec", (PyCFunction)_multibytecodec___create_codec, METH_O, _multibytecodec___create_codec__doc__},
-/*[clinic end generated code: output=680f59f4cfe63c25 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2fa0a38494716b97 input=a9049054013a1b77]*/
diff --git a/Modules/cjkcodecs/multibytecodec.c b/Modules/cjkcodecs/multibytecodec.c
index 22172b043bcd..4633499a8abf 100644
--- a/Modules/cjkcodecs/multibytecodec.c
+++ b/Modules/cjkcodecs/multibytecodec.c
@@ -895,6 +895,93 @@ _multibytecodec_MultibyteIncrementalEncoder_encode_impl(MultibyteIncrementalEnco
     return encoder_encode_stateful(STATEFUL_ECTX(self), input, final);
 }
 
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_getstate_impl(MultibyteIncrementalEncoderObject *self)
+/*[clinic end generated code: output=9794a5ace70d7048 input=4a2a82874ffa40bb]*/
+{
+    /* state made up of 1 byte for buffer size, up to MAXENCPENDING*4 bytes
+       for UTF-8 encoded buffer (each character can use up to 4
+       bytes), and required bytes for MultibyteCodec_State.c. A byte
+       array is used to avoid different compilers generating different
+       values for the same state, e.g. as a result of struct padding.
+    */
+    unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+    Py_ssize_t statesize;
+    const char *pendingbuffer = NULL;
+    Py_ssize_t pendingsize;
+
+    if (self->pending != NULL) {
+        pendingbuffer = PyUnicode_AsUTF8AndSize(self->pending, &pendingsize);
+        if (pendingbuffer == NULL) {
+            return NULL;
+        }
+        if (pendingsize > MAXENCPENDING*4) {
+            PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+            return NULL;
+        }
+        statebytes[0] = pendingsize;
+        memcpy(statebytes+1, pendingbuffer, pendingsize);
+        statesize = 1 + pendingsize;
+    } else {
+        statebytes[0] = 0;
+        statesize = 1;
+    }
+    memcpy(statebytes+statesize, self->state.c,
+           sizeof(self->state.c));
+    statesize += sizeof(self->state.c);
+
+    return (PyObject *)_PyLong_FromByteArray(statebytes, statesize,
+                                             1 /* little-endian */ ,
+                                             0 /* unsigned */ );
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalEncoder.setstate
+    state as statelong: object(type='PyLongObject *', subclass_of='&PyLong_Type')
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalEncoder_setstate_impl(MultibyteIncrementalEncoderObject *self,
+                                                          PyLongObject *statelong)
+/*[clinic end generated code: output=4e5e98ac1f4039ca input=c80fb5830d4d2f76]*/
+{
+    PyObject *pending = NULL;
+    unsigned char statebytes[1 + MAXENCPENDING*4 + sizeof(self->state.c)];
+
+    if (_PyLong_AsByteArray(statelong, statebytes, sizeof(statebytes),
+                            1 /* little-endian */ ,
+                            0 /* unsigned */ ) < 0) {
+        goto errorexit;
+    }
+
+    if (statebytes[0] > MAXENCPENDING*4) {
+        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+        return NULL;
+    }
+
+    pending = PyUnicode_DecodeUTF8((const char *)statebytes+1,
+                                   statebytes[0], "strict");
+    if (pending == NULL) {
+        goto errorexit;
+    }
+
+    Py_CLEAR(self->pending);
+    self->pending = pending;
+    memcpy(self->state.c, statebytes+1+statebytes[0],
+           sizeof(self->state.c));
+
+    Py_RETURN_NONE;
+
+errorexit:
+    Py_XDECREF(pending);
+    return NULL;
+}
+
 /*[clinic input]
 _multibytecodec.MultibyteIncrementalEncoder.reset
 [clinic start generated code]*/
@@ -919,6 +1006,8 @@ _multibytecodec_MultibyteIncrementalEncoder_reset_impl(MultibyteIncrementalEncod
 
 static struct PyMethodDef mbiencoder_methods[] = {
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_ENCODE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_GETSTATE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_SETSTATE_METHODDEF
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALENCODER_RESET_METHODDEF
     {NULL, NULL},
 };
@@ -984,6 +1073,7 @@ mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self)
 {
     PyObject_GC_UnTrack(self);
     ERROR_DECREF(self->errors);
+    Py_CLEAR(self->pending);
     Py_TYPE(self)->tp_free(self);
 }
 
@@ -1119,6 +1209,68 @@ _multibytecodec_MultibyteIncrementalDecoder_decode_impl(MultibyteIncrementalDeco
     return NULL;
 }
 
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.getstate
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_getstate_impl(MultibyteIncrementalDecoderObject *self)
+/*[clinic end generated code: output=255009c4713b7f82 input=4006aa49bddbaa75]*/
+{
+    PyObject *buffer;
+
+    buffer = PyBytes_FromStringAndSize((const char *)self->pending,
+                                       self->pendingsize);
+    if (buffer == NULL) {
+        return NULL;
+    }
+
+    return make_tuple(buffer, (Py_ssize_t)*self->state.c);
+}
+
+/*[clinic input]
+_multibytecodec.MultibyteIncrementalDecoder.setstate
+    state: object(subclass_of='&PyTuple_Type')
+    /
+[clinic start generated code]*/
+
+static PyObject *
+_multibytecodec_MultibyteIncrementalDecoder_setstate_impl(MultibyteIncrementalDecoderObject *self,
+                                                          PyObject *state)
+/*[clinic end generated code: output=106b2fbca3e2dcc2 input=e5d794e8baba1a47]*/
+{
+    PyObject *buffer;
+    Py_ssize_t buffersize;
+    char *bufferstr;
+    unsigned long long flag;
+
+    if (!PyArg_ParseTuple(state, "SK;setstate(): illegal state argument",
+                          &buffer, &flag))
+    {
+        return NULL;
+    }
+
+    buffersize = PyBytes_Size(buffer);
+    if (buffersize == -1) {
+        return NULL;
+    }
+
+    if (buffersize > MAXDECPENDING) {
+        PyErr_SetString(PyExc_UnicodeError, "pending buffer too large");
+        return NULL;
+    }
+
+    bufferstr = PyBytes_AsString(buffer);
+    if (bufferstr == NULL) {
+        return NULL;
+    }
+    self->pendingsize = buffersize;
+    memcpy(self->pending, bufferstr, self->pendingsize);
+    memcpy(self->state.c, (unsigned char *)&flag, sizeof(flag));
+
+    Py_RETURN_NONE;
+}
+
 /*[clinic input]
 _multibytecodec.MultibyteIncrementalDecoder.reset
 [clinic start generated code]*/
@@ -1137,6 +1289,8 @@ _multibytecodec_MultibyteIncrementalDecoder_reset_impl(MultibyteIncrementalDecod
 
 static struct PyMethodDef mbidecoder_methods[] = {
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_DECODE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_GETSTATE_METHODDEF
+    _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_SETSTATE_METHODDEF
     _MULTIBYTECODEC_MULTIBYTEINCREMENTALDECODER_RESET_METHODDEF
     {NULL, NULL},
 };
diff --git a/Modules/cjkcodecs/multibytecodec.h b/Modules/cjkcodecs/multibytecodec.h
index 5b8c22276b4b..6d34534ee685 100644
--- a/Modules/cjkcodecs/multibytecodec.h
+++ b/Modules/cjkcodecs/multibytecodec.h
@@ -16,12 +16,15 @@ typedef uint16_t ucs2_t, DBCHAR;
 typedef unsigned short ucs2_t, DBCHAR;
 #endif
 
-typedef union {
-    void *p;
-    int i;
+/*
+ * A struct that provides 8 bytes of state for multibyte
+ * codecs. Codecs are free to use this how they want. Note: if you
+ * need to add a new field to this struct, ensure that its byte order
+ * is independent of CPU endianness so that the return value of
+ * getstate doesn't differ between little and big endian CPUs.
+ */
+typedef struct {
     unsigned char c[8];
-    ucs2_t u2[4];
-    Py_UCS4 u4[2];
 } MultibyteCodec_State;
 
 typedef int (*mbcodec_init)(const void *config);