bpo-39219: Fix SyntaxError attributes in the tokenizer. (GH-17828)
https://github.com/python/cpython/commit/efd878cdb46d9c7038d93fb36eb1ff7dc5baf9ec commit: efd878cdb46d9c7038d93fb36eb1ff7dc5baf9ec branch: 3.8 author: Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com> committer: GitHub <noreply@github.com> date: 2020-02-12T02:35:10-08:00 summary: bpo-39219: Fix SyntaxError attributes in the tokenizer. (GH-17828) * Always set the text attribute. * Correct the offset attribute for non-ascii sources. (cherry picked from commit 0cc6b5e559b8303b18fdd56c2befd900fe7b5e35) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> files: A Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst M Lib/test/test_exceptions.py M Parser/tokenizer.c diff --git a/Lib/test/test_exceptions.py b/Lib/test/test_exceptions.py index 10c1e076464e2..3a32253157369 100644 --- a/Lib/test/test_exceptions.py +++ b/Lib/test/test_exceptions.py @@ -179,17 +179,25 @@ def ckmsg(src, msg, exception=SyntaxError): ckmsg(s, "inconsistent use of tabs and spaces in indentation", TabError) def testSyntaxErrorOffset(self): - def check(src, lineno, offset): + def check(src, lineno, offset, encoding='utf-8'): with self.assertRaises(SyntaxError) as cm: compile(src, '<fragment>', 'exec') self.assertEqual(cm.exception.lineno, lineno) self.assertEqual(cm.exception.offset, offset) + if cm.exception.text is not None: + if not isinstance(src, str): + src = src.decode(encoding, 'replace') + line = src.split('\n')[lineno-1] + self.assertEqual(cm.exception.text.rstrip('\n'), line) check('def fact(x):\n\treturn x!\n', 2, 10) check('1 +\n', 1, 4) check('def spam():\n print(1)\n print(2)', 3, 10) check('Python = "Python" +', 1, 20) check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20) + check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +', + 2, 19, encoding='cp1251') + check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 18) check('x = "a', 1, 7) check('lambda x: x = 2', 1, 1) @@ -205,6 +213,10 @@ def check(src, lineno, offset): 
check('0010 + 2', 1, 4) check('x = 32e-+4', 1, 8) check('x = 0o9', 1, 6) + check('\u03b1 = 0xI', 1, 6) + check(b'\xce\xb1 = 0xI', 1, 6) + check(b'# -*- coding: iso8859-7 -*-\n\xe1 = 0xI', 2, 6, + encoding='iso8859-7') # Errors thrown by symtable.c check('x = [(yield i) for i in range(3)]', 1, 5) diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst b/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst new file mode 100644 index 0000000000000..dac8360df712c --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2020-01-05-13-36-08.bpo-39219.uHtKd4.rst @@ -0,0 +1,2 @@ +Syntax errors raised in the tokenizer now always set correct "text" and +"offset" attributes. diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index f73c32684c7b7..aecbcebb917e8 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1,6 +1,7 @@ /* Tokenizer implementation */ +#define PY_SSIZE_T_CLEAN #include "Python.h" #include <ctype.h> @@ -1034,17 +1035,44 @@ tok_backup(struct tok_state *tok, int c) static int syntaxerror(struct tok_state *tok, const char *format, ...) 
{ + PyObject *errmsg, *errtext, *args; va_list vargs; #ifdef HAVE_STDARG_PROTOTYPES va_start(vargs, format); #else va_start(vargs); #endif - PyErr_FormatV(PyExc_SyntaxError, format, vargs); + errmsg = PyUnicode_FromFormatV(format, vargs); va_end(vargs); - PyErr_SyntaxLocationObject(tok->filename, - tok->lineno, - (int)(tok->cur - tok->line_start)); + if (!errmsg) { + goto error; + } + + errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, + "replace"); + if (!errtext) { + goto error; + } + int offset = (int)PyUnicode_GET_LENGTH(errtext); + Py_ssize_t line_len = strcspn(tok->line_start, "\n"); + if (line_len != tok->cur - tok->line_start) { + Py_DECREF(errtext); + errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, + "replace"); + } + if (!errtext) { + goto error; + } + + args = Py_BuildValue("(O(OiiN))", errmsg, + tok->filename, tok->lineno, offset, errtext); + if (args) { + PyErr_SetObject(PyExc_SyntaxError, args); + Py_DECREF(args); + } + +error: + Py_XDECREF(errmsg); tok->done = E_ERROR; return ERRORTOKEN; }
participants (1)
-
Miss Islington (bot)