[Python-3000-checkins] r56621 - in python/branches/py3k-struni: Lib/test/badsyntax_pep3120.py Lib/test/test_pep3120.py Misc/NEWS Parser/tokenizer.c Python/ast.c
martin.v.loewis
python-3000-checkins at python.org
Sun Jul 29 20:10:02 CEST 2007
Author: martin.v.loewis
Date: Sun Jul 29 20:10:01 2007
New Revision: 56621
Added:
python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py (contents, props changed)
python/branches/py3k-struni/Lib/test/test_pep3120.py
- copied, changed from r56608, python/branches/py3k-struni/Lib/test/test_pep263.py
Modified:
python/branches/py3k-struni/Misc/NEWS
python/branches/py3k-struni/Parser/tokenizer.c
python/branches/py3k-struni/Python/ast.c
Log:
Implement PEP 3120.
Added: python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py
==============================================================================
--- (empty file)
+++ python/branches/py3k-struni/Lib/test/badsyntax_pep3120.py Sun Jul 29 20:10:01 2007
@@ -0,0 +1 @@
+print("böse")
Copied: python/branches/py3k-struni/Lib/test/test_pep3120.py (from r56608, python/branches/py3k-struni/Lib/test/test_pep263.py)
==============================================================================
Binary files. No diff available.
Modified: python/branches/py3k-struni/Misc/NEWS
==============================================================================
--- python/branches/py3k-struni/Misc/NEWS (original)
+++ python/branches/py3k-struni/Misc/NEWS Sun Jul 29 20:10:01 2007
@@ -26,6 +26,8 @@
Core and Builtins
-----------------
+- PEP 3120: Change default encoding to UTF-8.
+
- PEP 3123: Use proper C inheritance for PyObject.
- Removed the __oct__ and __hex__ special methods and added a bin()
Modified: python/branches/py3k-struni/Parser/tokenizer.c
==============================================================================
--- python/branches/py3k-struni/Parser/tokenizer.c (original)
+++ python/branches/py3k-struni/Parser/tokenizer.c Sun Jul 29 20:10:01 2007
@@ -444,6 +444,34 @@
ungetc(c, tok->fp);
}
+/* Check whether the bytes at s start a valid
+ UTF-8 sequence. Return the number of bytes forming
+ the sequence if yes, 0 if not. */
+static int valid_utf8(const unsigned char* s)
+{
+ int expected = 0;
+ int length;
+ if (*s < 0x80)
+ /* single-byte code */
+ return 1;
+ if (*s < 0xc0)
+ /* following byte */
+ return 0;
+ if (*s < 0xE0)
+ expected = 1;
+ else if (*s < 0xF0)
+ expected = 2;
+ else if (*s < 0xF8)
+ expected = 3;
+ else
+ return 0;
+ length = expected + 1;
+ for (; expected; expected--)
+ if (s[expected] < 0x80 || s[expected] >= 0xC0)
+ return 0;
+ return length;
+}
+
/* Read a line of input from TOK. Determine encoding
if necessary. */
@@ -478,12 +506,13 @@
}
}
#ifndef PGEN
- /* The default encoding is ASCII, so make sure we don't have any
- non-ASCII bytes in it. */
+ /* The default encoding is UTF-8, so make sure we don't have any
+ non-UTF-8 sequences in it. */
if (line && !tok->encoding) {
unsigned char *c;
- for (c = (unsigned char *)line; *c; c++)
- if (*c > 127) {
+ int length;
+ for (c = (unsigned char *)line; *c; c += length)
+ if (!(length = valid_utf8(c))) {
badchar = *c;
break;
}
@@ -493,7 +522,7 @@
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
sprintf(buf,
- "Non-ASCII character '\\x%.2x' "
+ "Non-UTF-8 code starting with '\\x%.2x' "
"in file %.200s on line %i, "
"but no encoding declared; "
"see http://www.python.org/peps/pep-0263.html for details",
Modified: python/branches/py3k-struni/Python/ast.c
==============================================================================
--- python/branches/py3k-struni/Python/ast.c (original)
+++ python/branches/py3k-struni/Python/ast.c Sun Jul 29 20:10:01 2007
@@ -203,7 +203,8 @@
c.c_encoding = STR(n);
n = CHILD(n, 0);
} else {
- c.c_encoding = NULL;
+ /* PEP 3120 */
+ c.c_encoding = "utf-8";
}
c.c_arena = arena;
More information about the Python-3000-checkins
mailing list