[Python-checkins] gh-96268: Fix loading invalid UTF-8 (GH-96270)
miss-islington
webhook-mailer at python.org
Wed Sep 7 17:49:23 EDT 2022
https://github.com/python/cpython/commit/ffafa9b91da8731d21958209dd1478f48eaa2d09
commit: ffafa9b91da8731d21958209dd1478f48eaa2d09
branch: 3.11
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: miss-islington <31488909+miss-islington at users.noreply.github.com>
date: 2022-09-07T14:49:17-07:00
summary:
gh-96268: Fix loading invalid UTF-8 (GH-96270)
This makes tokenizer.c:valid_utf8 match the validity checks in stringlib/codecs.h:utf8_decode.
It also fixes an off-by-one error, introduced in 3.10, in the line number that the tokenizer reports for invalid UTF-8.
(cherry picked from commit 8bc356a7dd50cbdb46d10b8c7e457832431f5d9e)
Co-authored-by: Michael Droettboom <mdboom at gmail.com>
files:
A Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
M Lib/test/test_source_encoding.py
M Parser/tokenizer.c
diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py
index d37914dcd74..e357264eb1d 100644
--- a/Lib/test/test_source_encoding.py
+++ b/Lib/test/test_source_encoding.py
@@ -248,8 +248,10 @@ def test_invalid_utf8(self):
# test it is to write actual files to disk.
# Each example is put inside a string at the top of the file so
- # it's an otherwise valid Python source file.
- template = b'"%s"\n'
+ # it's an otherwise valid Python source file. Put some newlines
+ # beforehand so we can assert that the error is reported on the
+ # correct line.
+ template = b'\n\n\n"%s"\n'
fn = TESTFN
self.addCleanup(unlink, fn)
@@ -257,7 +259,12 @@ def test_invalid_utf8(self):
def check(content):
with open(fn, 'wb') as fp:
fp.write(template % content)
- script_helper.assert_python_failure(fn)
+ rc, stdout, stderr = script_helper.assert_python_failure(fn)
+ # We want to assert that the python subprocess failed gracefully,
+ # not via a signal.
+ self.assertGreaterEqual(rc, 1)
+ self.assertIn(b"Non-UTF-8 code starting with", stderr)
+ self.assertIn(b"on line 4", stderr)
# continuation bytes in a sequence of 2, 3, or 4 bytes
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
new file mode 100644
index 00000000000..987d85ff3ba
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-25-10-19-34.gh-issue-96268.AbYrLB.rst
@@ -0,0 +1,2 @@
+Loading a file with invalid UTF-8 will now report the broken character at
+the correct location.
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index b5ebcd044f8..8d9fbf5cf95 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -486,25 +486,59 @@ static void fp_ungetc(int c, struct tok_state *tok) {
/* Check whether the characters at s start a valid
UTF-8 sequence. Return the number of characters forming
- the sequence if yes, 0 if not. */
-static int valid_utf8(const unsigned char* s)
+ the sequence if yes, 0 if not. The special cases match
+ those in stringlib/codecs.h:utf8_decode.
+*/
+static int
+valid_utf8(const unsigned char* s)
{
int expected = 0;
int length;
- if (*s < 0x80)
+ if (*s < 0x80) {
/* single-byte code */
return 1;
- if (*s < 0xc0)
- /* following byte */
- return 0;
- if (*s < 0xE0)
+ }
+ else if (*s < 0xE0) {
+ /* \xC2\x80-\xDF\xBF -- 0080-07FF */
+ if (*s < 0xC2) {
+ /* invalid sequence
+ \x80-\xBF -- continuation byte
+ \xC0-\xC1 -- fake 0000-007F */
+ return 0;
+ }
expected = 1;
- else if (*s < 0xF0)
+ }
+ else if (*s < 0xF0) {
+ /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
+ if (*s == 0xE0 && *(s + 1) < 0xA0) {
+ /* invalid sequence
+ \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
+ return 0;
+ }
+ else if (*s == 0xED && *(s + 1) >= 0xA0) {
+ /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
+ will result in surrogates in range D800-DFFF. Surrogates are
+ not valid UTF-8 so they are rejected.
+ See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+ (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+ return 0;
+ }
expected = 2;
- else if (*s < 0xF8)
+ }
+ else if (*s < 0xF5) {
+ /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
+ if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) {
+ /* invalid sequence -- one of:
+ \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF
+ \xF4\x90\x80\x80- -- 110000- overflow */
+ return 0;
+ }
expected = 3;
- else
+ }
+ else {
+ /* invalid start byte */
return 0;
+ }
length = expected + 1;
for (; expected; expected--)
if (s[expected] < 0x80 || s[expected] >= 0xC0)
@@ -525,14 +559,12 @@ ensure_utf8(char *line, struct tok_state *tok)
}
}
if (badchar) {
- /* Need to add 1 to the line number, since this line
- has not been counted, yet. */
PyErr_Format(PyExc_SyntaxError,
"Non-UTF-8 code starting with '\\x%.2x' "
"in file %U on line %i, "
"but no encoding declared; "
"see https://peps.python.org/pep-0263/ for details",
- badchar, tok->filename, tok->lineno + 1);
+ badchar, tok->filename, tok->lineno);
return 0;
}
return 1;
More information about the Python-checkins
mailing list