[Patches] Unicode Patch Set 2000-03-31

M.-A. Lemburg mal@lemburg.com
Fri, 31 Mar 2000 10:35:29 +0200


This is a multi-part message in MIME format.
--------------4953D72560CE2BCDA7B64DD4
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

The attached patch set for the Unicode implementation includes
the following changes:

1. Error reporting in the codec registry and lookup mechanism
   is enhanced to be more informative.

2. The large unicode database table is broken into pages of
   4k entries each. This should fix compiler problems on some
   platforms.

-- 
Marc-Andre Lemburg
______________________________________________________________________
Business:                                      http://www.lemburg.com/
Python Pages:                           http://www.lemburg.com/python/
--------------4953D72560CE2BCDA7B64DD4
Content-Type: text/plain; charset=us-ascii;
 name="Unicode-Implementation-2000-03-31.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="Unicode-Implementation-2000-03-31.patch"

diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/codecs.py Python+Unicode/Lib/codecs.py
--- CVS-Python/Lib/codecs.py	Sat Mar 25 11:56:29 2000
+++ Python+Unicode/Lib/codecs.py	Wed Mar 29 18:25:57 2000
@@ -11,7 +11,11 @@
 
 ### Registry and builtin stateless codec functions
 
-from _codecs import *
+try:
+    from _codecs import *
+except ImportError,why:
+    raise SystemError,\
+          'Failed to load the builtin codecs: %s' % why
 
 ### Constants
 
Only in CVS-Python/Lib: distutils
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Lib/encodings/aliases.py Python+Unicode/Lib/encodings/aliases.py
--- CVS-Python/Lib/encodings/aliases.py	Sat Mar 11 00:17:18 2000
+++ Python+Unicode/Lib/encodings/aliases.py	Thu Mar 30 01:42:03 2000
@@ -24,8 +24,8 @@
     'u16': 'utf_16',
     'utf_16be': 'utf_16_be',
     'utf_16le': 'utf_16_le',
-    'UnicodeBigUnmarked': 'utf_16_be',
-    'UnicodeLittleUnmarked': 'utf_16_le',
+    'unicodebigunmarked': 'utf_16_be',
+    'unicodelittleunmarked': 'utf_16_le',
 
     # ASCII
     'us_ascii': 'ascii',
@@ -47,11 +47,11 @@
     'iso_8859_9': 'iso8859_9',
 
     # Mac
-    'MacCentralEurope': 'mac_latin2',
-    'MacCyrillic': 'mac_cyrillic',
-    'MacGreek': 'mac_greek',
-    'MacIceland': 'mac_iceland',
-    'MacRoman': 'mac_roman',
-    'MacTurkish': 'mac_turkish',
+    'maccentraleurope': 'mac_latin2',
+    'maccyrillic': 'mac_cyrillic',
+    'macgreek': 'mac_greek',
+    'maciceland': 'mac_iceland',
+    'macroman': 'mac_roman',
+    'macturkish': 'mac_turkish',
 
 }
Only in CVS-Python/Lib/test/output: test_extcall
Only in CVS-Python/Lib/test/output: test_winreg
Only in CVS-Python/Lib/test: test_extcall.py
Only in CVS-Python/Lib/test: test_winreg.py
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Modules/unicodedata.c Python+Unicode/Modules/unicodedata.c
--- CVS-Python/Modules/unicodedata.c	Sat Mar 11 00:10:21 2000
+++ Python+Unicode/Modules/unicodedata.c	Fri Mar 31 10:27:45 2000
@@ -13,6 +13,18 @@
 #include "Python.h"
 #include "unicodedatabase.h"
 
+/* --- Helpers ------------------------------------------------------------ */
+
+/* Return the record for code point i; out-of-range values map to U+0000. */
+static const _PyUnicode_DatabaseRecord *unicode_db(register int i)
+{
+    register int page = i >> 12;	/* each page holds 4096 records */
+    /* Bound by the number of pages, not the table's size in bytes. */
+    if (page >= 0 && page < (int)(sizeof(_PyUnicode_Database) / sizeof(_PyUnicode_Database[0])))
+	return &_PyUnicode_Database[page][i & 0x0fff];
+    return &_PyUnicode_Database[0][0];
+}
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyObject *
@@ -132,7 +144,7 @@
 			"need a single Unicode character as parameter");
 	goto onError;
     }
-    index = (int)_PyUnicode_Database[(int)*PyUnicode_AS_UNICODE(v)].category;
+    index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->category;
     if (index < 0 || 
 	index > sizeof(_PyUnicode_CategoryNames) / 
 	        sizeof(_PyUnicode_CategoryNames[0])) {
@@ -162,8 +174,7 @@
 			"need a single Unicode character as parameter");
 	goto onError;
     }
-    index = (int)_PyUnicode_Database[
-			  (int)*PyUnicode_AS_UNICODE(v)].bidirectional;
+    index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->bidirectional;
     if (index < 0 || 
 	index > sizeof(_PyUnicode_CategoryNames) / 
 	        sizeof(_PyUnicode_CategoryNames[0])) {
@@ -193,8 +204,7 @@
 			"need a single Unicode character as parameter");
 	goto onError;
     }
-    value = (int)_PyUnicode_Database[
-                          (int)*PyUnicode_AS_UNICODE(v)].combining;
+    value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->combining;
     return PyInt_FromLong(value);
     
  onError:
@@ -216,7 +226,7 @@
 			"need a single Unicode character as parameter");
 	goto onError;
     }
-    value = (int)_PyUnicode_Database[(int)*PyUnicode_AS_UNICODE(v)].mirrored;
+    value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->mirrored;
     return PyInt_FromLong(value);
     
  onError:
@@ -238,7 +248,7 @@
 			"need a single Unicode character as parameter");
 	goto onError;
     }
-    value = _PyUnicode_Database[(int)*PyUnicode_AS_UNICODE(v)].decomposition;
+    value = unicode_db((int)*PyUnicode_AS_UNICODE(v))->decomposition;
     if (value == NULL)
 	return PyString_FromString("");
     else
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Modules/unicodedatabase.c Python+Unicode/Modules/unicodedatabase.c
--- CVS-Python/Modules/unicodedatabase.c	Sat Mar 11 00:08:02 2000
+++ Python+Unicode/Modules/unicodedatabase.c	Fri Mar 31 10:18:32 2000
@@ -87,9 +87,9 @@
     0  /* Sentinel */
 };
 
-/* --- Unicode Database --------------------------------------------------- */
+/* --- Unicode Database Pages --------------------------------------------- */
 
-const _PyUnicode_DatabaseRecord _PyUnicode_Database[65536] = {
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_0[4096] = {
     
 /* U+0000 */ { 13, 0, 15, 0, 0 },
 /* U+0001 */ { 13, 0, 15, 0, 0 },
@@ -4187,6 +4187,11 @@
 /* U+0ffd */ { 0, 0, 0, 0, 0 },
 /* U+0ffe */ { 0, 0, 0, 0, 0 },
 /* U+0fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_1[4096] = {
+    
 /* U+1000 */ { 19, 0, 1, 0, 0 },
 /* U+1001 */ { 19, 0, 1, 0, 0 },
 /* U+1002 */ { 19, 0, 1, 0, 0 },
@@ -8283,6 +8288,11 @@
 /* U+1ffd */ { 29, 0, 19, 0, "00B4" },
 /* U+1ffe */ { 29, 0, 19, 0, "<compat> 0020 0314" },
 /* U+1fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_2[4096] = {
+    
 /* U+2000 */ { 10, 0, 18, 0, "2002" },
 /* U+2001 */ { 10, 0, 18, 0, "2003" },
 /* U+2002 */ { 10, 0, 18, 0, "<compat> 0020" },
@@ -12379,6 +12389,11 @@
 /* U+2ffd */ { 0, 0, 0, 0, 0 },
 /* U+2ffe */ { 0, 0, 0, 0, 0 },
 /* U+2fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_3[4096] = {
+    
 /* U+3000 */ { 10, 0, 18, 0, "<wide> 0020" },
 /* U+3001 */ { 26, 0, 19, 0, 0 },
 /* U+3002 */ { 26, 0, 19, 0, 0 },
@@ -16475,6 +16490,11 @@
 /* U+3ffd */ { 0, 0, 0, 0, 0 },
 /* U+3ffe */ { 0, 0, 0, 0, 0 },
 /* U+3fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_4[4096] = {
+    
 /* U+4000 */ { 0, 0, 0, 0, 0 },
 /* U+4001 */ { 0, 0, 0, 0, 0 },
 /* U+4002 */ { 0, 0, 0, 0, 0 },
@@ -20571,6 +20591,11 @@
 /* U+4ffd */ { 0, 0, 0, 0, 0 },
 /* U+4ffe */ { 0, 0, 0, 0, 0 },
 /* U+4fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_5[4096] = {
+    
 /* U+5000 */ { 0, 0, 0, 0, 0 },
 /* U+5001 */ { 0, 0, 0, 0, 0 },
 /* U+5002 */ { 0, 0, 0, 0, 0 },
@@ -24667,6 +24692,11 @@
 /* U+5ffd */ { 0, 0, 0, 0, 0 },
 /* U+5ffe */ { 0, 0, 0, 0, 0 },
 /* U+5fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_6[4096] = {
+    
 /* U+6000 */ { 0, 0, 0, 0, 0 },
 /* U+6001 */ { 0, 0, 0, 0, 0 },
 /* U+6002 */ { 0, 0, 0, 0, 0 },
@@ -28763,6 +28793,11 @@
 /* U+6ffd */ { 0, 0, 0, 0, 0 },
 /* U+6ffe */ { 0, 0, 0, 0, 0 },
 /* U+6fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_7[4096] = {
+    
 /* U+7000 */ { 0, 0, 0, 0, 0 },
 /* U+7001 */ { 0, 0, 0, 0, 0 },
 /* U+7002 */ { 0, 0, 0, 0, 0 },
@@ -32859,6 +32894,11 @@
 /* U+7ffd */ { 0, 0, 0, 0, 0 },
 /* U+7ffe */ { 0, 0, 0, 0, 0 },
 /* U+7fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_8[4096] = {
+    
 /* U+8000 */ { 0, 0, 0, 0, 0 },
 /* U+8001 */ { 0, 0, 0, 0, 0 },
 /* U+8002 */ { 0, 0, 0, 0, 0 },
@@ -36955,6 +36995,11 @@
 /* U+8ffd */ { 0, 0, 0, 0, 0 },
 /* U+8ffe */ { 0, 0, 0, 0, 0 },
 /* U+8fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_9[4096] = {
+    
 /* U+9000 */ { 0, 0, 0, 0, 0 },
 /* U+9001 */ { 0, 0, 0, 0, 0 },
 /* U+9002 */ { 0, 0, 0, 0, 0 },
@@ -41051,6 +41096,11 @@
 /* U+9ffd */ { 0, 0, 0, 0, 0 },
 /* U+9ffe */ { 0, 0, 0, 0, 0 },
 /* U+9fff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_10[4096] = {
+    
 /* U+a000 */ { 19, 0, 1, 0, 0 },
 /* U+a001 */ { 19, 0, 1, 0, 0 },
 /* U+a002 */ { 19, 0, 1, 0, 0 },
@@ -45147,6 +45197,11 @@
 /* U+affd */ { 0, 0, 0, 0, 0 },
 /* U+affe */ { 0, 0, 0, 0, 0 },
 /* U+afff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_11[4096] = {
+    
 /* U+b000 */ { 0, 0, 0, 0, 0 },
 /* U+b001 */ { 0, 0, 0, 0, 0 },
 /* U+b002 */ { 0, 0, 0, 0, 0 },
@@ -49243,6 +49298,11 @@
 /* U+bffd */ { 0, 0, 0, 0, 0 },
 /* U+bffe */ { 0, 0, 0, 0, 0 },
 /* U+bfff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_12[4096] = {
+    
 /* U+c000 */ { 0, 0, 0, 0, 0 },
 /* U+c001 */ { 0, 0, 0, 0, 0 },
 /* U+c002 */ { 0, 0, 0, 0, 0 },
@@ -53339,6 +53399,11 @@
 /* U+cffd */ { 0, 0, 0, 0, 0 },
 /* U+cffe */ { 0, 0, 0, 0, 0 },
 /* U+cfff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_13[4096] = {
+    
 /* U+d000 */ { 0, 0, 0, 0, 0 },
 /* U+d001 */ { 0, 0, 0, 0, 0 },
 /* U+d002 */ { 0, 0, 0, 0, 0 },
@@ -57435,6 +57500,11 @@
 /* U+dffd */ { 0, 0, 0, 0, 0 },
 /* U+dffe */ { 0, 0, 0, 0, 0 },
 /* U+dfff */ { 15, 0, 1, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_14[4096] = {
+    
 /* U+e000 */ { 16, 0, 1, 0, 0 },
 /* U+e001 */ { 0, 0, 0, 0, 0 },
 /* U+e002 */ { 0, 0, 0, 0, 0 },
@@ -61531,6 +61601,11 @@
 /* U+effd */ { 0, 0, 0, 0, 0 },
 /* U+effe */ { 0, 0, 0, 0, 0 },
 /* U+efff */ { 0, 0, 0, 0, 0 },
+
+};
+
+const _PyUnicode_DatabaseRecord _PyUnicode_Database_15[4096] = {
+    
 /* U+f000 */ { 0, 0, 0, 0, 0 },
 /* U+f001 */ { 0, 0, 0, 0, 0 },
 /* U+f002 */ { 0, 0, 0, 0, 0 },
@@ -65627,4 +65702,27 @@
 /* U+fffd */ { 30, 0, 19, 0, 0 },
 /* U+fffe */ { 0, 0, 0, 0, 0 },
 /* U+ffff */ { 0, 0, 0, 0, 0 },
+
+};
+
+/* --- Unicode Database --------------------------------------------------- */
+
+const _PyUnicode_DatabaseRecord *_PyUnicode_Database[16] = {
+    _PyUnicode_Database_0,
+    _PyUnicode_Database_1,
+    _PyUnicode_Database_2,
+    _PyUnicode_Database_3,
+    _PyUnicode_Database_4,
+    _PyUnicode_Database_5,
+    _PyUnicode_Database_6,
+    _PyUnicode_Database_7,
+    _PyUnicode_Database_8,
+    _PyUnicode_Database_9,
+    _PyUnicode_Database_10,
+    _PyUnicode_Database_11,
+    _PyUnicode_Database_12,
+    _PyUnicode_Database_13,
+    _PyUnicode_Database_14,
+    _PyUnicode_Database_15,
 };
+
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Modules/unicodedatabase.h Python+Unicode/Modules/unicodedatabase.h
--- CVS-Python/Modules/unicodedatabase.h	Sat Mar 11 00:08:04 2000
+++ Python+Unicode/Modules/unicodedatabase.h	Fri Mar 31 10:16:03 2000
@@ -30,4 +30,4 @@
 
 /* --- Unicode Database --------------------------------------------------- */
 
-extern const _PyUnicode_DatabaseRecord _PyUnicode_Database[65536];
+extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database[16];
Only in CVS-Python/Objects: .#stringobject.c.2.59
Only in CVS-Python/Objects: stringobject.c.orig
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Objects/unicodeobject.c Python+Unicode/Objects/unicodeobject.c
--- CVS-Python/Objects/unicodeobject.c	Tue Mar 28 09:19:18 2000
+++ Python+Unicode/Objects/unicodeobject.c	Tue Mar 28 15:47:04 2000
@@ -1483,7 +1483,9 @@
 }
 
 #ifdef MS_WIN32
+
 /* --- MBCS codecs for Windows -------------------------------------------- */
+
 PyObject *PyUnicode_DecodeMBCS(const char *s,
 				int size,
 				const char *errors)
@@ -1536,6 +1538,7 @@
     }
     return repr;
 }
+
 #endif /* MS_WIN32 */
 
 /* --- Character Mapping Codec -------------------------------------------- */
Only in CVS-Python/PC: winreg.c
Only in CVS-Python/PCbuild: python16.dsp
Only in CVS-Python/PCbuild: python16.wse
Only in CVS-Python/PCbuild: winreg.dsp
diff -u -rP -x *.o -x *.pyc -x Makefile -x *~ -x *.so -x add2lib -x pgen -x buildno -x config.* -x libpython* -x python -x Setup -x Setup.local -x Setup.thread -x hassignal -x Makefile.pre -x *.bak -x *.s -x DEADJOE -x Demo -x CVS CVS-Python/Python/codecs.c Python+Unicode/Python/codecs.c
--- CVS-Python/Python/codecs.c	Fri Mar 24 22:21:29 2000
+++ Python+Unicode/Python/codecs.c	Wed Mar 29 18:17:19 2000
@@ -27,42 +27,61 @@
    This is done in a lazy way so that the Unicode implementation does
    not downgrade startup time of scripts not needing it.
 
-   Errors are silently ignored by this function. Only one try is made.
+   ImportErrors are silently ignored by this function. Only one try is
+   made.
 
 */
 
 static
-void import_encodings() 
+int import_encodings() 
 {
     PyObject *mod;
     
     import_encodings_called = 1;
     mod = PyImport_ImportModule("encodings");
     if (mod == NULL) {
-	PyErr_Clear();
-	return;
+	if (PyErr_ExceptionMatches(PyExc_ImportError)) {
+	    /* Ignore ImportErrors... this is done so that
+	       distributions can disable the encodings package. Note
+	       that other errors are not masked, e.g. SystemErrors
+	       raised to inform the user of an error in the Python
+	       configuration are still reported back to the user. */
+	    PyErr_Clear();
+	    return 0;
+	}
+	return -1;
     }
     Py_DECREF(mod);
+    return 0;
 }
 
 /* Register a new codec search function.
 
+   As a side effect, this tries to load the encodings package, if not
+   yet done, to make sure that it is always first in the list of
+   search functions.
+
    The search_function's refcount is incremented by this function. */
 
 int PyCodec_Register(PyObject *search_function)
 {
-    if (!import_encodings_called)
-	import_encodings();
+    if (!import_encodings_called) {
+	if (import_encodings())
+	    goto onError;
+    }
     if (search_function == NULL) {
 	PyErr_BadArgument();
-	return -1;
+	goto onError;
     }
     if (!PyCallable_Check(search_function)) {
 	PyErr_SetString(PyExc_TypeError,
 			"argument must be callable");
-	return -1;
+	goto onError;
     }
     return PyList_Append(_PyCodec_SearchPath, search_function);
+
+ onError:
+    return -1;
 }
 
 static
@@ -89,20 +108,29 @@
    characters. This makes encodings looked up through this mechanism
    effectively case-insensitive.
 
-   If no codec is found, a KeyError is set and NULL returned.  */
+   If no codec is found, a KeyError is set and NULL returned. 
+
+   As a side effect, this tries to load the encodings package, if not
+   yet done. This is part of the lazy load strategy for the encodings
+   package.
+
+*/
 
 PyObject *_PyCodec_Lookup(const char *encoding)
 {
     PyObject *result, *args = NULL, *v;
     int i, len;
 
-    if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
+    if (_PyCodec_SearchCache == NULL || 
+	_PyCodec_SearchPath == NULL) {
 	PyErr_SetString(PyExc_SystemError,
 			"codec module not properly initialized");
 	goto onError;
     }
-    if (!import_encodings_called)
-	import_encodings();
+    if (!import_encodings_called) {
+	if (import_encodings())
+	    goto onError;
+    }
 
     /* Convert the encoding to a lower-cased Python string */
     v = lowercasestring(encoding);
@@ -127,6 +155,12 @@
     len = PyList_Size(_PyCodec_SearchPath);
     if (len < 0)
 	goto onError;
+    if (len == 0) {
+	PyErr_SetString(PyExc_LookupError,
+			"no codec search functions registered: "
+			"can't find encoding");
+	goto onError;
+    }
 
     for (i = 0; i < len; i++) {
 	PyObject *func;

--------------4953D72560CE2BCDA7B64DD4--