gh-127787: refactor helpers for `PyUnicodeErrorObject` internal interface (GH-127789)
https://github.com/python/cpython/commit/fa985bee6189aabac1c329f2de32aa9a4e88e550 commit: fa985bee6189aabac1c329f2de32aa9a4e88e550 branch: main author: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> committer: encukou <encukou@gmail.com> date: 2025-01-03T13:37:02+01:00 summary: gh-127787: refactor helpers for `PyUnicodeErrorObject` internal interface (GH-127789) - Unify `get_unicode` and `get_string` in a single function. - Allow to retrieve the underlying `object` attribute, its size, and the adjusted 'start' and 'end', all at once. Add a new `_PyUnicodeError_GetParams` internal function for this. (In `exceptions.c`, it's somewhat common to not need all the attributes, but the compiler has opportunity to inline the function and optimize unneeded work away. Outside that file, we'll usually need all or most of them at once.) - Use a common implementation for the following functions: - `PyUnicode{Decode,Encode}Error_GetEncoding` - `PyUnicode{Decode,Encode,Translate}Error_GetObject` - `PyUnicode{Decode,Encode,Translate}Error_{Get,Set}Reason` - `PyUnicode{Decode,Encode,Translate}Error_{Get,Set}{Start,End}` files: M Include/cpython/pyerrors.h M Objects/exceptions.c diff --git a/Include/cpython/pyerrors.h b/Include/cpython/pyerrors.h index b36b4681f5dddb..49a6265e5eb02f 100644 --- a/Include/cpython/pyerrors.h +++ b/Include/cpython/pyerrors.h @@ -94,6 +94,12 @@ PyAPI_FUNC(void) _PyErr_ChainExceptions1(PyObject *); /* In exceptions.c */ +PyAPI_FUNC(int) _PyUnicodeError_GetParams( + PyObject *self, + PyObject **obj, Py_ssize_t *objlen, + Py_ssize_t *start, Py_ssize_t *end, + int as_bytes); + PyAPI_FUNC(PyObject*) PyUnstable_Exc_PrepReraiseStar( PyObject *orig, PyObject *excs); diff --git a/Objects/exceptions.c b/Objects/exceptions.c index 6880c24196cbb8..714f8c828afbc1 100644 --- a/Objects/exceptions.c +++ b/Objects/exceptions.c @@ -2667,47 +2667,167 @@ SimpleExtendsException(PyExc_Exception, ValueError, SimpleExtendsException(PyExc_ValueError, UnicodeError, 
"Unicode related error."); + +/* + * Check the validity of 'attr' as a unicode or bytes object depending + * on 'as_bytes' and return a new reference on it if it is the case. + * + * The 'name' is the attribute name and is only used for error reporting. + * + * On success, this returns a strong reference on 'attr'. + * On failure, this sets a TypeError and returns NULL. + */ static PyObject * -get_bytes(PyObject *attr, const char *name) +as_unicode_error_attribute(PyObject *attr, const char *name, int as_bytes) { - if (!attr) { - PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name); + assert(as_bytes == 0 || as_bytes == 1); + if (attr == NULL) { + PyErr_Format(PyExc_TypeError, "%s attribute not set", name); return NULL; } - - if (!PyBytes_Check(attr)) { - PyErr_Format(PyExc_TypeError, "%.200s attribute must be bytes", name); + if (!(as_bytes ? PyBytes_Check(attr) : PyUnicode_Check(attr))) { + PyErr_Format(PyExc_TypeError, + "%s attribute must be %s", + name, + as_bytes ? "bytes" : "unicode"); return NULL; } return Py_NewRef(attr); } -static PyObject * -get_unicode(PyObject *attr, const char *name) -{ - if (!attr) { - PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name); - return NULL; - } - if (!PyUnicode_Check(attr)) { +#define PyUnicodeError_Check(PTR) \ + PyObject_TypeCheck((PTR), (PyTypeObject *)PyExc_UnicodeError) +#define PyUnicodeError_CAST(PTR) \ + (assert(PyUnicodeError_Check(PTR)), ((PyUnicodeErrorObject *)(PTR))) + + +/* class names to use when reporting errors */ +#define Py_UNICODE_ENCODE_ERROR_NAME "UnicodeEncodeError" +#define Py_UNICODE_DECODE_ERROR_NAME "UnicodeDecodeError" +#define Py_UNICODE_TRANSLATE_ERROR_NAME "UnicodeTranslateError" + + +/* + * Check that 'self' is a UnicodeError object. + * + * On success, this returns 0. + * On failure, this sets a TypeError exception and returns -1. + * + * The 'expect_type' is the name of the expected type, which is + * only used for error reporting. 
+ * + * As an implementation detail, the `PyUnicode*Error_*` functions + * currently allow *any* subclass of UnicodeError as 'self'. + * + * Use one of the `Py_UNICODE_*_ERROR_NAME` macros to avoid typos. + */ +static inline int +check_unicode_error_type(PyObject *self, const char *expect_type) +{ + assert(self != NULL); + if (!PyUnicodeError_Check(self)) { PyErr_Format(PyExc_TypeError, - "%.200s attribute must be unicode", name); - return NULL; + "expecting a %s object, got %T", expect_type, self); + return -1; } - return Py_NewRef(attr); + return 0; } -static int -set_unicodefromstring(PyObject **attr, const char *value) + +// --- PyUnicodeEncodeObject: internal helpers -------------------------------- +// +// In the helpers below, the caller is responsible to ensure that 'self' +// is a PyUnicodeErrorObject, although this is verified on DEBUG builds +// through PyUnicodeError_CAST(). + +/* + * Return the underlying (str) 'encoding' attribute of a UnicodeError object. + */ +static inline PyObject * +unicode_error_get_encoding_impl(PyObject *self) +{ + assert(self != NULL); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + return as_unicode_error_attribute(exc->encoding, "encoding", false); +} + + +/* + * Return the underlying 'object' attribute of a UnicodeError object + * as a bytes or a string instance, depending on the 'as_bytes' flag. + */ +static inline PyObject * +unicode_error_get_object_impl(PyObject *self, int as_bytes) { - PyObject *obj = PyUnicode_FromString(value); - if (!obj) + assert(self != NULL); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + return as_unicode_error_attribute(exc->object, "object", as_bytes); +} + + +/* + * Return the underlying (str) 'reason' attribute of a UnicodeError object. 
+ */ +static inline PyObject * +unicode_error_get_reason_impl(PyObject *self) +{ + assert(self != NULL); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + return as_unicode_error_attribute(exc->reason, "reason", false); +} + + +/* + * Set the underlying (str) 'reason' attribute of a UnicodeError object. + * + * Return 0 on success and -1 on failure. + */ +static inline int +unicode_error_set_reason_impl(PyObject *self, const char *reason) +{ + assert(self != NULL); + PyObject *value = PyUnicode_FromString(reason); + if (value == NULL) { return -1; - Py_XSETREF(*attr, obj); + } + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + Py_XSETREF(exc->reason, value); return 0; } + +/* + * Set the 'start' attribute of a UnicodeError object. + * + * Return 0 on success and -1 on failure. + */ +static inline int +unicode_error_set_start_impl(PyObject *self, Py_ssize_t start) +{ + assert(self != NULL); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + exc->start = start; + return 0; +} + + +/* + * Set the 'end' attribute of a UnicodeError object. + * + * Return 0 on success and -1 on failure. + */ +static inline int +unicode_error_set_end_impl(PyObject *self, Py_ssize_t end) +{ + assert(self != NULL); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + exc->end = end; + return 0; +} + +// --- PyUnicodeEncodeObject: internal getters -------------------------------- + /* * Adjust the (inclusive) 'start' value of a UnicodeError object. * @@ -2728,6 +2848,7 @@ unicode_error_adjust_start(Py_ssize_t start, Py_ssize_t objlen) return start; } + /* * Adjust the (exclusive) 'end' value of a UnicodeError object. 
* @@ -2748,134 +2869,162 @@ unicode_error_adjust_end(Py_ssize_t end, Py_ssize_t objlen) return end; } -#define _PyUnicodeError_CAST(PTR) ((PyUnicodeErrorObject *)(PTR)) -#define PyUnicodeError_Check(PTR) \ - PyObject_TypeCheck((PTR), (PyTypeObject *)PyExc_UnicodeError) -#define PyUnicodeError_CAST(PTR) \ - (assert(PyUnicodeError_Check(PTR)), _PyUnicodeError_CAST(PTR)) - -static inline int -check_unicode_error_type(PyObject *self, const char *expect_type) +/* + * Get various common parameters of a UnicodeError object. + * + * The caller is responsible to ensure that 'self' is a PyUnicodeErrorObject, + * although this condition is verified by this function on DEBUG builds. + * + * Return 0 on success and -1 on failure. + * + * Output parameters: + * + * obj A strong reference to the 'object' attribute. + * objlen The 'object' length. + * start The clipped 'start' attribute. + * end The clipped 'end' attribute. + * + * An output parameter can be NULL to indicate that + * the corresponding value does not need to be stored. + * + * Input parameter: + * + * as_bytes If 1, the error's 'object' attribute must be a bytes object, + * i.e. the call is for a `UnicodeDecodeError`. Otherwise, the + * 'object' attribute must be a string. + * + * A TypeError is raised if the 'object' type is incompatible. + */ +int +_PyUnicodeError_GetParams(PyObject *self, + PyObject **obj, Py_ssize_t *objlen, + Py_ssize_t *start, Py_ssize_t *end, + int as_bytes) { - if (!PyUnicodeError_Check(self)) { - PyErr_Format(PyExc_TypeError, - "expecting a %s object, got %T", expect_type, self); + assert(self != NULL); + assert(as_bytes == 0 || as_bytes == 1); + PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self); + PyObject *r = as_unicode_error_attribute(exc->object, "object", as_bytes); + if (r == NULL) { return -1; } + + Py_ssize_t n = as_bytes ? 
PyBytes_GET_SIZE(r) : PyUnicode_GET_LENGTH(r); + if (objlen != NULL) { + *objlen = n; + } + if (start != NULL) { + *start = unicode_error_adjust_start(exc->start, n); + assert(*start >= 0); + assert(*start <= n); + } + if (end != NULL) { + *end = unicode_error_adjust_end(exc->end, n); + assert(*end >= 0); + assert(*end <= n); + } + if (obj != NULL) { + *obj = r; + } + else { + Py_DECREF(r); + } return 0; } -static inline PyUnicodeErrorObject * -as_unicode_error(PyObject *self, const char *expect_type) -{ - int rc = check_unicode_error_type(self, expect_type); - return rc < 0 ? NULL : _PyUnicodeError_CAST(self); -} +// --- PyUnicodeEncodeObject: 'encoding' getters ------------------------------ +// Note: PyUnicodeTranslateError does not have an 'encoding' attribute. PyObject * PyUnicodeEncodeError_GetEncoding(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - return exc == NULL ? NULL : get_unicode(exc->encoding, "encoding"); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_encoding_impl(self); } + PyObject * PyUnicodeDecodeError_GetEncoding(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - return exc == NULL ? NULL : get_unicode(exc->encoding, "encoding"); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_encoding_impl(self); } + +// --- PyUnicodeEncodeObject: 'object' getters -------------------------------- + PyObject * PyUnicodeEncodeError_GetObject(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - return exc == NULL ? NULL : get_unicode(exc->object, "object"); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? 
NULL : unicode_error_get_object_impl(self, false); } + PyObject * PyUnicodeDecodeError_GetObject(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - return exc == NULL ? NULL : get_bytes(exc->object, "object"); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_object_impl(self, true); } + PyObject * PyUnicodeTranslateError_GetObject(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError"); - return exc == NULL ? NULL : get_unicode(exc->object, "object"); + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_object_impl(self, false); } + +// --- PyUnicodeEncodeObject: 'start' getters --------------------------------- + +/* + * Specialization of _PyUnicodeError_GetParams() for the 'start' attribute. + * + * The caller is responsible to ensure that 'self' is a PyUnicodeErrorObject, + * although this condition is verified by this function on DEBUG builds. + */ +static inline int +unicode_error_get_start_impl(PyObject *self, Py_ssize_t *start, int as_bytes) +{ + assert(self != NULL); + return _PyUnicodeError_GetParams(self, NULL, NULL, start, NULL, as_bytes); +} + + int PyUnicodeEncodeError_GetStart(PyObject *self, Py_ssize_t *start) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_unicode(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyUnicode_GET_LENGTH(obj); - Py_DECREF(obj); - *start = unicode_error_adjust_start(exc->start, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? 
-1 : unicode_error_get_start_impl(self, start, false); } int PyUnicodeDecodeError_GetStart(PyObject *self, Py_ssize_t *start) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_bytes(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyBytes_GET_SIZE(obj); - Py_DECREF(obj); - *start = unicode_error_adjust_start(exc->start, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_get_start_impl(self, start, true); } int PyUnicodeTranslateError_GetStart(PyObject *self, Py_ssize_t *start) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_unicode(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyUnicode_GET_LENGTH(obj); - Py_DECREF(obj); - *start = unicode_error_adjust_start(exc->start, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_get_start_impl(self, start, false); } -static inline int -unicode_error_set_start_impl(PyObject *self, Py_ssize_t start) -{ - PyUnicodeErrorObject *exc = _PyUnicodeError_CAST(self); - exc->start = start; - return 0; -} - +// --- PyUnicodeEncodeObject: 'start' setters --------------------------------- int PyUnicodeEncodeError_SetStart(PyObject *self, Py_ssize_t start) { - int rc = check_unicode_error_type(self, "UnicodeEncodeError"); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); return rc < 0 ? 
-1 : unicode_error_set_start_impl(self, start); } @@ -2883,7 +3032,7 @@ PyUnicodeEncodeError_SetStart(PyObject *self, Py_ssize_t start) int PyUnicodeDecodeError_SetStart(PyObject *self, Py_ssize_t start) { - int rc = check_unicode_error_type(self, "UnicodeDecodeError"); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); return rc < 0 ? -1 : unicode_error_set_start_impl(self, start); } @@ -2891,78 +3040,57 @@ PyUnicodeDecodeError_SetStart(PyObject *self, Py_ssize_t start) int PyUnicodeTranslateError_SetStart(PyObject *self, Py_ssize_t start) { - int rc = check_unicode_error_type(self, "UnicodeTranslateError"); + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); return rc < 0 ? -1 : unicode_error_set_start_impl(self, start); } +// --- PyUnicodeEncodeObject: 'end' getters ----------------------------------- + +/* + * Specialization of _PyUnicodeError_GetParams() for the 'end' attribute. + * + * The caller is responsible to ensure that 'self' is a PyUnicodeErrorObject, + * although this condition is verified by this function on DEBUG builds. + */ +static inline int +unicode_error_get_end_impl(PyObject *self, Py_ssize_t *end, int as_bytes) +{ + assert(self != NULL); + return _PyUnicodeError_GetParams(self, NULL, NULL, NULL, end, as_bytes); +} + + int PyUnicodeEncodeError_GetEnd(PyObject *self, Py_ssize_t *end) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_unicode(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyUnicode_GET_LENGTH(obj); - Py_DECREF(obj); - *end = unicode_error_adjust_end(exc->end, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? 
-1 : unicode_error_get_end_impl(self, end, false); } int PyUnicodeDecodeError_GetEnd(PyObject *self, Py_ssize_t *end) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_bytes(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyBytes_GET_SIZE(obj); - Py_DECREF(obj); - *end = unicode_error_adjust_end(exc->end, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_get_end_impl(self, end, true); } int PyUnicodeTranslateError_GetEnd(PyObject *self, Py_ssize_t *end) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError"); - if (exc == NULL) { - return -1; - } - PyObject *obj = get_unicode(exc->object, "object"); - if (obj == NULL) { - return -1; - } - Py_ssize_t size = PyUnicode_GET_LENGTH(obj); - Py_DECREF(obj); - *end = unicode_error_adjust_end(exc->end, size); - return 0; + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_get_end_impl(self, end, false); } -static inline int -unicode_error_set_end_impl(PyObject *self, Py_ssize_t end) -{ - PyUnicodeErrorObject *exc = _PyUnicodeError_CAST(self); - exc->end = end; - return 0; -} - +// --- PyUnicodeEncodeObject: 'end' setters ----------------------------------- int PyUnicodeEncodeError_SetEnd(PyObject *self, Py_ssize_t end) { - int rc = check_unicode_error_type(self, "UnicodeEncodeError"); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); return rc < 0 ? -1 : unicode_error_set_end_impl(self, end); } @@ -2970,7 +3098,7 @@ PyUnicodeEncodeError_SetEnd(PyObject *self, Py_ssize_t end) int PyUnicodeDecodeError_SetEnd(PyObject *self, Py_ssize_t end) { - int rc = check_unicode_error_type(self, "UnicodeDecodeError"); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); return rc < 0 ? 
-1 : unicode_error_set_end_impl(self, end); } @@ -2978,56 +3106,60 @@ PyUnicodeDecodeError_SetEnd(PyObject *self, Py_ssize_t end) int PyUnicodeTranslateError_SetEnd(PyObject *self, Py_ssize_t end) { - int rc = check_unicode_error_type(self, "UnicodeTranslateError"); + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); return rc < 0 ? -1 : unicode_error_set_end_impl(self, end); } +// --- PyUnicodeEncodeObject: 'reason' getters -------------------------------- + PyObject * PyUnicodeEncodeError_GetReason(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - return exc == NULL ? NULL : get_unicode(exc->reason, "reason"); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_reason_impl(self); } PyObject * PyUnicodeDecodeError_GetReason(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - return exc == NULL ? NULL : get_unicode(exc->reason, "reason"); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_reason_impl(self); } PyObject * PyUnicodeTranslateError_GetReason(PyObject *self) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError"); - return exc == NULL ? NULL : get_unicode(exc->reason, "reason"); + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); + return rc < 0 ? NULL : unicode_error_get_reason_impl(self); } +// --- PyUnicodeEncodeObject: 'reason' setters -------------------------------- + int PyUnicodeEncodeError_SetReason(PyObject *self, const char *reason) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError"); - return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason); + int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME); + return rc < 0 ? 
-1 : unicode_error_set_reason_impl(self, reason); } int PyUnicodeDecodeError_SetReason(PyObject *self, const char *reason) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError"); - return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason); + int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_set_reason_impl(self, reason); } int PyUnicodeTranslateError_SetReason(PyObject *self, const char *reason) { - PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError"); - return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason); + int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME); + return rc < 0 ? -1 : unicode_error_set_reason_impl(self, reason); }
participants (1)
-
encukou