[Python-checkins] cpython: add str.casefold() (closes #13752)

benjamin.peterson python-checkins at python.org
Sat Jan 14 19:23:39 CET 2012


http://hg.python.org/cpython/rev/d4669f43d05f
changeset:   74405:d4669f43d05f
user:        Benjamin Peterson <benjamin at python.org>
date:        Sat Jan 14 13:23:30 2012 -0500
summary:
  add str.casefold() (closes #13752)

files:
  Doc/library/stdtypes.rst         |    8 +
  Include/unicodeobject.h          |    5 +
  Lib/test/test_unicode.py         |    8 +
  Misc/NEWS                        |    2 +
  Objects/unicodectype.c           |   25 +-
  Objects/unicodeobject.c          |   35 +
  Objects/unicodetype_db.h         |  497 ++++++++++++++----
  Tools/unicode/makeunicodedata.py |   50 +-
  8 files changed, 493 insertions(+), 137 deletions(-)


diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -1002,6 +1002,14 @@
    rest lowercased.
 
 
+.. method:: str.casefold()
+
+   Return a casefolded copy of the string. Casefolded strings may be used for
+   caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``.
+
+   .. versionadded:: 3.3
+
+
 .. method:: str.center(width[, fillchar])
 
    Return centered in a string of length *width*. Padding is done using the
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -2023,6 +2023,11 @@
     Py_UCS4 *res
     );
 
+PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
 PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
     Py_UCS4 ch         /* Unicode character */
     );
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -565,6 +565,14 @@
         self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
         self.assertEqual('\u2177'.lower(), '\u2177')
 
+    def test_casefold(self):
+        self.assertEqual('hello'.casefold(), 'hello')
+        self.assertEqual('hELlo'.casefold(), 'hello')
+        self.assertEqual('ß'.casefold(), 'ss')
+        self.assertEqual('\ufb01'.casefold(), 'fi')
+        self.assertEqual('\u03a3'.casefold(), '\u03c3')
+        self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
+
     def test_upper(self):
         string_tests.CommonTest.test_upper(self)
         self.assertEqual('\U0001044F'.upper(), '\U00010427')
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@
 Core and Builtins
 -----------------
 
+- Issue #13752: Add a casefold() method to str.
+
 - Issue #13761: Add a "flush" keyword argument to the print() function,
   used to ensure flushing the output stream.
 
diff --git a/Objects/unicodectype.c b/Objects/unicodectype.c
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@@ -185,7 +185,7 @@
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK)
-        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
+        return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
     return ctype->upper ? ctype->upper : ch;
 }
 
@@ -197,7 +197,7 @@
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK)
-        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
+        return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
     return ctype->lower ? ctype->lower : ch;
 }
 
@@ -206,7 +206,7 @@
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->lower & 0xFFFFFF;
+        int index = ctype->lower & 0xFFFF;
         int n = ctype->lower >> 24;
         int i;
         for (i = 0; i < n; i++)
@@ -222,7 +222,7 @@
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->title & 0xFFFFFF;
+        int index = ctype->title & 0xFFFF;
         int n = ctype->title >> 24;
         int i;
         for (i = 0; i < n; i++)
@@ -238,7 +238,7 @@
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 
     if (ctype->flags & EXTENDED_CASE_MASK) {
-        int index = ctype->upper & 0xFFFFFF;
+        int index = ctype->upper & 0xFFFF;
         int n = ctype->upper >> 24;
         int i;
         for (i = 0; i < n; i++)
@@ -249,6 +249,21 @@
     return 1;
 }
 
+int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    if (ctype->flags & EXTENDED_CASE_MASK && (ctype->lower >> 20) & 7) {
+        int index = (ctype->lower & 0xFFFF) + (ctype->lower >> 24);
+        int n = (ctype->lower >> 20) & 7;
+        int i;
+        for (i = 0; i < n; i++)
+            res[i] = _PyUnicode_ExtendedCase[index + i];
+        return n;
+    }
+    return _PyUnicode_ToLowerFull(ch, res);
+}
+
 int _PyUnicode_IsCased(Py_UCS4 ch)
 {
     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -9577,6 +9577,24 @@
 }
 
 static Py_ssize_t
+do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
+{
+    Py_ssize_t i, k = 0;
+
+    for (i = 0; i < length; i++) {
+        Py_UCS4 c = PyUnicode_READ(kind, data, i);
+        Py_UCS4 mapped[3];
+        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
+        for (j = 0; j < n_res; j++) {
+            if (mapped[j] > *maxchar)
+                *maxchar = mapped[j];
+            res[k++] = mapped[j];
+        }
+    }
+    return k;
+}
+
+static Py_ssize_t
 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
 {
     Py_ssize_t i, k = 0;
@@ -10501,6 +10519,22 @@
     return case_operation(self, do_capitalize);
 }
 
+PyDoc_STRVAR(casefold__doc__,
+             "S.casefold() -> str\n\
+\n\
+Return a version of S suitable for caseless comparisons.");
+
+static PyObject *
+unicode_casefold(PyObject *self)
+{
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+    if (PyUnicode_IS_ASCII(self))
+        return ascii_upper_or_lower(self, 1);
+    return case_operation(self, do_casefold);
+}
+
+
 /* Argument converter.  Coerces to a single unicode character */
 
 static int
@@ -12998,6 +13032,7 @@
     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
+    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -76,7 +76,7 @@
     {0, 0, 0, 0, 0, 4096},
     {0, 0, 0, 0, 2, 3076},
     {0, 0, 0, 0, 3, 3076},
-    {924, 181, 924, 0, 0, 9993},
+    {16777218, 17825792, 16777218, 0, 0, 26377},
     {0, 0, 0, 0, 0, 5632},
     {0, 0, 0, 0, 1, 3076},
     {0, 0, 0, 0, 0, 3072},
@@ -110,7 +110,7 @@
     {220, 252, 220, 0, 0, 10113},
     {221, 253, 221, 0, 0, 10113},
     {222, 254, 222, 0, 0, 10113},
-    {33554433, 16777216, 33554435, 0, 0, 26377},
+    {33554438, 18874371, 33554440, 0, 0, 26377},
     {192, 224, 192, 0, 0, 9993},
     {193, 225, 193, 0, 0, 9993},
     {194, 226, 194, 0, 0, 9993},
@@ -190,7 +190,7 @@
     {300, 301, 300, 0, 0, 9993},
     {302, 303, 302, 0, 0, 10113},
     {302, 303, 302, 0, 0, 9993},
-    {16777223, 33554437, 16777223, 0, 0, 26497},
+    {16777228, 33554442, 16777228, 0, 0, 26497},
     {73, 305, 73, 0, 0, 9993},
     {306, 307, 306, 0, 0, 10113},
     {306, 307, 306, 0, 0, 9993},
@@ -214,7 +214,7 @@
     {325, 326, 325, 0, 0, 9993},
     {327, 328, 327, 0, 0, 10113},
     {327, 328, 327, 0, 0, 9993},
-    {33554441, 16777224, 33554441, 0, 0, 26377},
+    {33554448, 18874381, 33554448, 0, 0, 26377},
     {330, 331, 330, 0, 0, 10113},
     {330, 331, 330, 0, 0, 9993},
     {332, 333, 332, 0, 0, 10113},
@@ -268,7 +268,7 @@
     {379, 380, 379, 0, 0, 9993},
     {381, 382, 381, 0, 0, 10113},
     {381, 382, 381, 0, 0, 9993},
-    {83, 383, 83, 0, 0, 9993},
+    {16777236, 17825810, 16777236, 0, 0, 26377},
     {579, 384, 579, 0, 0, 9993},
     {385, 595, 385, 0, 0, 10113},
     {386, 387, 386, 0, 0, 10113},
@@ -371,7 +371,7 @@
     {492, 493, 492, 0, 0, 9993},
     {494, 495, 494, 0, 0, 10113},
     {494, 495, 494, 0, 0, 9993},
-    {33554444, 16777227, 33554444, 0, 0, 26377},
+    {33554456, 18874389, 33554456, 0, 0, 26377},
     {497, 499, 498, 0, 0, 10113},
     {497, 499, 498, 0, 0, 10049},
     {497, 499, 498, 0, 0, 9993},
@@ -490,7 +490,7 @@
     {439, 658, 439, 0, 0, 9993},
     {0, 0, 0, 0, 0, 14089},
     {0, 0, 0, 0, 0, 5889},
-    {921, 837, 921, 0, 0, 13832},
+    {16777244, 17825818, 16777244, 0, 0, 30216},
     {880, 881, 880, 0, 0, 10113},
     {880, 881, 880, 0, 0, 9993},
     {882, 883, 882, 0, 0, 10113},
@@ -508,7 +508,7 @@
     {908, 972, 908, 0, 0, 10113},
     {910, 973, 910, 0, 0, 10113},
     {911, 974, 911, 0, 0, 10113},
-    {50331663, 16777230, 50331663, 0, 0, 26377},
+    {50331681, 19922973, 50331681, 0, 0, 26377},
     {913, 945, 913, 0, 0, 10113},
     {914, 946, 914, 0, 0, 10113},
     {915, 947, 915, 0, 0, 10113},
@@ -539,7 +539,7 @@
     {904, 941, 904, 0, 0, 9993},
     {905, 942, 905, 0, 0, 9993},
     {906, 943, 906, 0, 0, 9993},
-    {50331667, 16777234, 50331667, 0, 0, 26377},
+    {50331688, 19922980, 50331688, 0, 0, 26377},
     {913, 945, 913, 0, 0, 9993},
     {914, 946, 914, 0, 0, 9993},
     {915, 947, 915, 0, 0, 9993},
@@ -557,7 +557,7 @@
     {927, 959, 927, 0, 0, 9993},
     {928, 960, 928, 0, 0, 9993},
     {929, 961, 929, 0, 0, 9993},
-    {931, 962, 931, 0, 0, 9993},
+    {16777261, 17825835, 16777261, 0, 0, 26377},
     {931, 963, 931, 0, 0, 9993},
     {932, 964, 932, 0, 0, 9993},
     {933, 965, 933, 0, 0, 9993},
@@ -571,11 +571,11 @@
     {910, 973, 910, 0, 0, 9993},
     {911, 974, 911, 0, 0, 9993},
     {975, 983, 975, 0, 0, 10113},
-    {914, 976, 914, 0, 0, 9993},
-    {920, 977, 920, 0, 0, 9993},
+    {16777264, 17825838, 16777264, 0, 0, 26377},
+    {16777267, 17825841, 16777267, 0, 0, 26377},
     {0, 0, 0, 0, 0, 10113},
-    {934, 981, 934, 0, 0, 9993},
-    {928, 982, 928, 0, 0, 9993},
+    {16777270, 17825844, 16777270, 0, 0, 26377},
+    {16777273, 17825847, 16777273, 0, 0, 26377},
     {975, 983, 975, 0, 0, 9993},
     {984, 985, 984, 0, 0, 10113},
     {984, 985, 984, 0, 0, 9993},
@@ -601,11 +601,11 @@
     {1004, 1005, 1004, 0, 0, 9993},
     {1006, 1007, 1006, 0, 0, 10113},
     {1006, 1007, 1006, 0, 0, 9993},
-    {922, 1008, 922, 0, 0, 9993},
-    {929, 1009, 929, 0, 0, 9993},
+    {16777276, 17825850, 16777276, 0, 0, 26377},
+    {16777279, 17825853, 16777279, 0, 0, 26377},
     {1017, 1010, 1017, 0, 0, 9993},
     {1012, 952, 1012, 0, 0, 10113},
-    {917, 1013, 917, 0, 0, 9993},
+    {16777282, 17825856, 16777282, 0, 0, 26377},
     {1015, 1016, 1015, 0, 0, 10113},
     {1015, 1016, 1015, 0, 0, 9993},
     {1017, 1010, 1017, 0, 0, 10113},
@@ -978,7 +978,7 @@
     {1364, 1412, 1364, 0, 0, 9993},
     {1365, 1413, 1365, 0, 0, 9993},
     {1366, 1414, 1366, 0, 0, 9993},
-    {33554455, 16777238, 33554457, 0, 0, 26377},
+    {33554502, 18874435, 33554504, 0, 0, 26377},
     {0, 0, 0, 0, 0, 1537},
     {4256, 11520, 4256, 0, 0, 10113},
     {4257, 11521, 4257, 0, 0, 10113},
@@ -1180,13 +1180,13 @@
     {7826, 7827, 7826, 0, 0, 9993},
     {7828, 7829, 7828, 0, 0, 10113},
     {7828, 7829, 7828, 0, 0, 9993},
-    {33554460, 16777243, 33554460, 0, 0, 26377},
-    {33554463, 16777246, 33554463, 0, 0, 26377},
-    {33554466, 16777249, 33554466, 0, 0, 26377},
-    {33554469, 16777252, 33554469, 0, 0, 26377},
-    {33554472, 16777255, 33554472, 0, 0, 26377},
-    {7776, 7835, 7776, 0, 0, 9993},
-    {7838, 223, 7838, 0, 0, 10113},
+    {33554509, 18874442, 33554509, 0, 0, 26377},
+    {33554514, 18874447, 33554514, 0, 0, 26377},
+    {33554519, 18874452, 33554519, 0, 0, 26377},
+    {33554524, 18874457, 33554524, 0, 0, 26377},
+    {33554529, 18874462, 33554529, 0, 0, 26377},
+    {16777317, 17825891, 16777317, 0, 0, 26377},
+    {16777321, 18874470, 16777321, 0, 0, 26497},
     {7840, 7841, 7840, 0, 0, 10113},
     {7840, 7841, 7840, 0, 0, 9993},
     {7842, 7843, 7842, 0, 0, 10113},
@@ -1355,13 +1355,13 @@
     {8011, 8003, 8011, 0, 0, 10113},
     {8012, 8004, 8012, 0, 0, 10113},
     {8013, 8005, 8013, 0, 0, 10113},
-    {33554475, 16777258, 33554475, 0, 0, 26377},
+    {33554541, 18874474, 33554541, 0, 0, 26377},
     {8025, 8017, 8025, 0, 0, 9993},
-    {50331694, 16777261, 50331694, 0, 0, 26377},
+    {50331763, 19923055, 50331763, 0, 0, 26377},
     {8027, 8019, 8027, 0, 0, 9993},
-    {50331698, 16777265, 50331698, 0, 0, 26377},
+    {50331770, 19923062, 50331770, 0, 0, 26377},
     {8029, 8021, 8029, 0, 0, 9993},
-    {50331702, 16777269, 50331702, 0, 0, 26377},
+    {50331777, 19923069, 50331777, 0, 0, 26377},
     {8031, 8023, 8031, 0, 0, 9993},
     {8025, 8017, 8025, 0, 0, 10113},
     {8027, 8019, 8027, 0, 0, 10113},
@@ -1397,110 +1397,110 @@
     {8171, 8059, 8171, 0, 0, 9993},
     {8186, 8060, 8186, 0, 0, 9993},
     {8187, 8061, 8187, 0, 0, 9993},
-    {33554490, 16777273, 16777276, 0, 0, 26377},
-    {33554494, 16777277, 16777280, 0, 0, 26377},
-    {33554498, 16777281, 16777284, 0, 0, 26377},
-    {33554502, 16777285, 16777288, 0, 0, 26377},
-    {33554506, 16777289, 16777292, 0, 0, 26377},
-    {33554510, 16777293, 16777296, 0, 0, 26377},
-    {33554514, 16777297, 16777300, 0, 0, 26377},
-    {33554518, 16777301, 16777304, 0, 0, 26377},
-    {33554522, 16777305, 16777308, 0, 0, 26433},
-    {33554526, 16777309, 16777312, 0, 0, 26433},
-    {33554530, 16777313, 16777316, 0, 0, 26433},
-    {33554534, 16777317, 16777320, 0, 0, 26433},
-    {33554538, 16777321, 16777324, 0, 0, 26433},
-    {33554542, 16777325, 16777328, 0, 0, 26433},
-    {33554546, 16777329, 16777332, 0, 0, 26433},
-    {33554550, 16777333, 16777336, 0, 0, 26433},
-    {33554554, 16777337, 16777340, 0, 0, 26377},
-    {33554558, 16777341, 16777344, 0, 0, 26377},
-    {33554562, 16777345, 16777348, 0, 0, 26377},
-    {33554566, 16777349, 16777352, 0, 0, 26377},
-    {33554570, 16777353, 16777356, 0, 0, 26377},
-    {33554574, 16777357, 16777360, 0, 0, 26377},
-    {33554578, 16777361, 16777364, 0, 0, 26377},
-    {33554582, 16777365, 16777368, 0, 0, 26377},
-    {33554586, 16777369, 16777372, 0, 0, 26433},
-    {33554590, 16777373, 16777376, 0, 0, 26433},
-    {33554594, 16777377, 16777380, 0, 0, 26433},
-    {33554598, 16777381, 16777384, 0, 0, 26433},
-    {33554602, 16777385, 16777388, 0, 0, 26433},
-    {33554606, 16777389, 16777392, 0, 0, 26433},
-    {33554610, 16777393, 16777396, 0, 0, 26433},
-    {33554614, 16777397, 16777400, 0, 0, 26433},
-    {33554618, 16777401, 16777404, 0, 0, 26377},
-    {33554622, 16777405, 16777408, 0, 0, 26377},
-    {33554626, 16777409, 16777412, 0, 0, 26377},
-    {33554630, 16777413, 16777416, 0, 0, 26377},
-    {33554634, 16777417, 16777420, 0, 0, 26377},
-    {33554638, 16777421, 16777424, 0, 0, 26377},
-    {33554642, 16777425, 16777428, 0, 0, 26377},
-    {33554646, 16777429, 16777432, 0, 0, 26377},
-    {33554650, 16777433, 16777436, 0, 0, 26433},
-    {33554654, 16777437, 16777440, 0, 0, 26433},
-    {33554658, 16777441, 16777444, 0, 0, 26433},
-    {33554662, 16777445, 16777448, 0, 0, 26433},
-    {33554666, 16777449, 16777452, 0, 0, 26433},
-    {33554670, 16777453, 16777456, 0, 0, 26433},
-    {33554674, 16777457, 16777460, 0, 0, 26433},
-    {33554678, 16777461, 16777464, 0, 0, 26433},
+    {33554567, 18874500, 16777353, 0, 0, 26377},
+    {33554573, 18874506, 16777359, 0, 0, 26377},
+    {33554579, 18874512, 16777365, 0, 0, 26377},
+    {33554585, 18874518, 16777371, 0, 0, 26377},
+    {33554591, 18874524, 16777377, 0, 0, 26377},
+    {33554597, 18874530, 16777383, 0, 0, 26377},
+    {33554603, 18874536, 16777389, 0, 0, 26377},
+    {33554609, 18874542, 16777395, 0, 0, 26377},
+    {33554615, 18874548, 16777401, 0, 0, 26433},
+    {33554621, 18874554, 16777407, 0, 0, 26433},
+    {33554627, 18874560, 16777413, 0, 0, 26433},
+    {33554633, 18874566, 16777419, 0, 0, 26433},
+    {33554639, 18874572, 16777425, 0, 0, 26433},
+    {33554645, 18874578, 16777431, 0, 0, 26433},
+    {33554651, 18874584, 16777437, 0, 0, 26433},
+    {33554657, 18874590, 16777443, 0, 0, 26433},
+    {33554663, 18874596, 16777449, 0, 0, 26377},
+    {33554669, 18874602, 16777455, 0, 0, 26377},
+    {33554675, 18874608, 16777461, 0, 0, 26377},
+    {33554681, 18874614, 16777467, 0, 0, 26377},
+    {33554687, 18874620, 16777473, 0, 0, 26377},
+    {33554693, 18874626, 16777479, 0, 0, 26377},
+    {33554699, 18874632, 16777485, 0, 0, 26377},
+    {33554705, 18874638, 16777491, 0, 0, 26377},
+    {33554711, 18874644, 16777497, 0, 0, 26433},
+    {33554717, 18874650, 16777503, 0, 0, 26433},
+    {33554723, 18874656, 16777509, 0, 0, 26433},
+    {33554729, 18874662, 16777515, 0, 0, 26433},
+    {33554735, 18874668, 16777521, 0, 0, 26433},
+    {33554741, 18874674, 16777527, 0, 0, 26433},
+    {33554747, 18874680, 16777533, 0, 0, 26433},
+    {33554753, 18874686, 16777539, 0, 0, 26433},
+    {33554759, 18874692, 16777545, 0, 0, 26377},
+    {33554765, 18874698, 16777551, 0, 0, 26377},
+    {33554771, 18874704, 16777557, 0, 0, 26377},
+    {33554777, 18874710, 16777563, 0, 0, 26377},
+    {33554783, 18874716, 16777569, 0, 0, 26377},
+    {33554789, 18874722, 16777575, 0, 0, 26377},
+    {33554795, 18874728, 16777581, 0, 0, 26377},
+    {33554801, 18874734, 16777587, 0, 0, 26377},
+    {33554807, 18874740, 16777593, 0, 0, 26433},
+    {33554813, 18874746, 16777599, 0, 0, 26433},
+    {33554819, 18874752, 16777605, 0, 0, 26433},
+    {33554825, 18874758, 16777611, 0, 0, 26433},
+    {33554831, 18874764, 16777617, 0, 0, 26433},
+    {33554837, 18874770, 16777623, 0, 0, 26433},
+    {33554843, 18874776, 16777629, 0, 0, 26433},
+    {33554849, 18874782, 16777635, 0, 0, 26433},
     {8120, 8112, 8120, 0, 0, 9993},
     {8121, 8113, 8121, 0, 0, 9993},
-    {33554682, 16777465, 33554684, 0, 0, 26377},
-    {33554687, 16777470, 16777473, 0, 0, 26377},
-    {33554691, 16777474, 33554693, 0, 0, 26377},
-    {33554696, 16777479, 33554696, 0, 0, 26377},
-    {50331915, 16777482, 50331918, 0, 0, 26377},
+    {33554855, 18874788, 33554857, 0, 0, 26377},
+    {33554862, 18874795, 16777648, 0, 0, 26377},
+    {33554868, 18874801, 33554870, 0, 0, 26377},
+    {33554875, 18874808, 33554875, 0, 0, 26377},
+    {50332097, 19923389, 50332100, 0, 0, 26377},
     {8120, 8112, 8120, 0, 0, 10113},
     {8121, 8113, 8121, 0, 0, 10113},
     {8122, 8048, 8122, 0, 0, 10113},
     {8123, 8049, 8123, 0, 0, 10113},
-    {33554706, 16777489, 16777492, 0, 0, 26433},
-    {921, 8126, 921, 0, 0, 9993},
-    {33554710, 16777493, 33554712, 0, 0, 26377},
-    {33554715, 16777498, 16777501, 0, 0, 26377},
-    {33554719, 16777502, 33554721, 0, 0, 26377},
-    {33554724, 16777507, 33554724, 0, 0, 26377},
-    {50331943, 16777510, 50331946, 0, 0, 26377},
+    {33554890, 18874823, 16777676, 0, 0, 26433},
+    {16777679, 17826253, 16777679, 0, 0, 26377},
+    {33554899, 18874832, 33554901, 0, 0, 26377},
+    {33554906, 18874839, 16777692, 0, 0, 26377},
+    {33554912, 18874845, 33554914, 0, 0, 26377},
+    {33554919, 18874852, 33554919, 0, 0, 26377},
+    {50332141, 19923433, 50332144, 0, 0, 26377},
     {8136, 8050, 8136, 0, 0, 10113},
     {8137, 8051, 8137, 0, 0, 10113},
     {8138, 8052, 8138, 0, 0, 10113},
     {8139, 8053, 8139, 0, 0, 10113},
-    {33554734, 16777517, 16777520, 0, 0, 26433},
+    {33554934, 18874867, 16777720, 0, 0, 26433},
     {8152, 8144, 8152, 0, 0, 9993},
     {8153, 8145, 8153, 0, 0, 9993},
-    {50331954, 16777521, 50331954, 0, 0, 26377},
-    {50331958, 16777525, 50331958, 0, 0, 26377},
-    {33554746, 16777529, 33554746, 0, 0, 26377},
-    {50331965, 16777532, 50331965, 0, 0, 26377},
+    {50332157, 19923449, 50332157, 0, 0, 26377},
+    {50332164, 19923456, 50332164, 0, 0, 26377},
+    {33554954, 18874887, 33554954, 0, 0, 26377},
+    {50332176, 19923468, 50332176, 0, 0, 26377},
     {8152, 8144, 8152, 0, 0, 10113},
     {8153, 8145, 8153, 0, 0, 10113},
     {8154, 8054, 8154, 0, 0, 10113},
     {8155, 8055, 8155, 0, 0, 10113},
     {8168, 8160, 8168, 0, 0, 9993},
     {8169, 8161, 8169, 0, 0, 9993},
-    {50331969, 16777536, 50331969, 0, 0, 26377},
-    {50331973, 16777540, 50331973, 0, 0, 26377},
-    {33554761, 16777544, 33554761, 0, 0, 26377},
+    {50332183, 19923475, 50332183, 0, 0, 26377},
+    {50332190, 19923482, 50332190, 0, 0, 26377},
+    {33554980, 18874913, 33554980, 0, 0, 26377},
     {8172, 8165, 8172, 0, 0, 9993},
-    {33554764, 16777547, 33554764, 0, 0, 26377},
-    {50331983, 16777550, 50331983, 0, 0, 26377},
+    {33554985, 18874918, 33554985, 0, 0, 26377},
+    {50332207, 19923499, 50332207, 0, 0, 26377},
     {8168, 8160, 8168, 0, 0, 10113},
     {8169, 8161, 8169, 0, 0, 10113},
     {8170, 8058, 8170, 0, 0, 10113},
     {8171, 8059, 8171, 0, 0, 10113},
     {8172, 8165, 8172, 0, 0, 10113},
-    {33554771, 16777554, 33554773, 0, 0, 26377},
-    {33554776, 16777559, 16777562, 0, 0, 26377},
-    {33554780, 16777563, 33554782, 0, 0, 26377},
-    {33554785, 16777568, 33554785, 0, 0, 26377},
-    {50332004, 16777571, 50332007, 0, 0, 26377},
+    {33554997, 18874930, 33554999, 0, 0, 26377},
+    {33555004, 18874937, 16777790, 0, 0, 26377},
+    {33555010, 18874943, 33555012, 0, 0, 26377},
+    {33555017, 18874950, 33555017, 0, 0, 26377},
+    {50332239, 19923531, 50332242, 0, 0, 26377},
     {8184, 8056, 8184, 0, 0, 10113},
     {8185, 8057, 8185, 0, 0, 10113},
     {8186, 8060, 8186, 0, 0, 10113},
     {8187, 8061, 8187, 0, 0, 10113},
-    {33554795, 16777578, 16777581, 0, 0, 26433},
+    {33555032, 18874965, 16777818, 0, 0, 26433},
     {0, 0, 0, 0, 0, 3076},
     {0, 0, 0, 0, 4, 3076},
     {0, 0, 0, 0, 5, 3076},
@@ -2037,18 +2037,18 @@
     {42918, 42919, 42918, 0, 0, 9993},
     {42920, 42921, 42920, 0, 0, 10113},
     {42920, 42921, 42920, 0, 0, 9993},
-    {33554799, 16777582, 33554801, 0, 0, 26377},
-    {33554804, 16777587, 33554806, 0, 0, 26377},
-    {33554809, 16777592, 33554811, 0, 0, 26377},
-    {50332030, 16777597, 50332033, 0, 0, 26377},
-    {50332037, 16777604, 50332040, 0, 0, 26377},
-    {33554828, 16777611, 33554830, 0, 0, 26377},
-    {33554833, 16777616, 33554835, 0, 0, 26377},
-    {33554838, 16777621, 33554840, 0, 0, 26377},
-    {33554843, 16777626, 33554845, 0, 0, 26377},
-    {33554848, 16777631, 33554850, 0, 0, 26377},
-    {33554853, 16777636, 33554855, 0, 0, 26377},
-    {33554858, 16777641, 33554860, 0, 0, 26377},
+    {33555038, 18874971, 33555040, 0, 0, 26377},
+    {33555045, 18874978, 33555047, 0, 0, 26377},
+    {33555052, 18874985, 33555054, 0, 0, 26377},
+    {50332276, 19923568, 50332279, 0, 0, 26377},
+    {50332286, 19923578, 50332289, 0, 0, 26377},
+    {33555079, 18875012, 33555081, 0, 0, 26377},
+    {33555086, 18875019, 33555088, 0, 0, 26377},
+    {33555093, 18875026, 33555095, 0, 0, 26377},
+    {33555100, 18875033, 33555102, 0, 0, 26377},
+    {33555107, 18875040, 33555109, 0, 0, 26377},
+    {33555114, 18875047, 33555116, 0, 0, 26377},
+    {33555121, 18875054, 33555123, 0, 0, 26377},
     {0, 0, 0, 0, 0, 1025},
     {65313, 65345, 65313, 0, 0, 10113},
     {65314, 65346, 65314, 0, 0, 10113},
@@ -2188,7 +2188,12 @@
 /* extended case mappings */
 
 const Py_UCS4 _PyUnicode_ExtendedCase[] = {
+    181,
+    956,
+    924,
     223,
+    115,
+    115,
     83,
     83,
     83,
@@ -2198,263 +2203,440 @@
     304,
     329,
     700,
+    110,
+    700,
     78,
+    383,
+    115,
+    83,
     496,
+    106,
+    780,
     74,
     780,
+    837,
+    953,
+    921,
     912,
+    953,
+    776,
+    769,
     921,
     776,
     769,
     944,
+    965,
+    776,
+    769,
     933,
     776,
     769,
+    962,
+    963,
+    931,
+    976,
+    946,
+    914,
+    977,
+    952,
+    920,
+    981,
+    966,
+    934,
+    982,
+    960,
+    928,
+    1008,
+    954,
+    922,
+    1009,
+    961,
+    929,
+    1013,
+    949,
+    917,
     1415,
+    1381,
+    1410,
     1333,
     1362,
     1333,
     1410,
     7830,
+    104,
+    817,
     72,
     817,
     7831,
+    116,
+    776,
     84,
     776,
     7832,
+    119,
+    778,
     87,
     778,
     7833,
+    121,
+    778,
     89,
     778,
     7834,
+    97,
+    702,
     65,
     702,
+    7835,
+    7777,
+    7776,
+    223,
+    115,
+    115,
+    7838,
     8016,
+    965,
+    787,
     933,
     787,
     8018,
+    965,
+    787,
+    768,
     933,
     787,
     768,
     8020,
+    965,
+    787,
+    769,
     933,
     787,
     769,
     8022,
+    965,
+    787,
+    834,
     933,
     787,
     834,
     8064,
+    7936,
+    953,
     7944,
     921,
     8072,
     8065,
+    7937,
+    953,
     7945,
     921,
     8073,
     8066,
+    7938,
+    953,
     7946,
     921,
     8074,
     8067,
+    7939,
+    953,
     7947,
     921,
     8075,
     8068,
+    7940,
+    953,
     7948,
     921,
     8076,
     8069,
+    7941,
+    953,
     7949,
     921,
     8077,
     8070,
+    7942,
+    953,
     7950,
     921,
     8078,
     8071,
+    7943,
+    953,
     7951,
     921,
     8079,
     8064,
+    7936,
+    953,
     7944,
     921,
     8072,
     8065,
+    7937,
+    953,
     7945,
     921,
     8073,
     8066,
+    7938,
+    953,
     7946,
     921,
     8074,
     8067,
+    7939,
+    953,
     7947,
     921,
     8075,
     8068,
+    7940,
+    953,
     7948,
     921,
     8076,
     8069,
+    7941,
+    953,
     7949,
     921,
     8077,
     8070,
+    7942,
+    953,
     7950,
     921,
     8078,
     8071,
+    7943,
+    953,
     7951,
     921,
     8079,
     8080,
+    7968,
+    953,
     7976,
     921,
     8088,
     8081,
+    7969,
+    953,
     7977,
     921,
     8089,
     8082,
+    7970,
+    953,
     7978,
     921,
     8090,
     8083,
+    7971,
+    953,
     7979,
     921,
     8091,
     8084,
+    7972,
+    953,
     7980,
     921,
     8092,
     8085,
+    7973,
+    953,
     7981,
     921,
     8093,
     8086,
+    7974,
+    953,
     7982,
     921,
     8094,
     8087,
+    7975,
+    953,
     7983,
     921,
     8095,
     8080,
+    7968,
+    953,
     7976,
     921,
     8088,
     8081,
+    7969,
+    953,
     7977,
     921,
     8089,
     8082,
+    7970,
+    953,
     7978,
     921,
     8090,
     8083,
+    7971,
+    953,
     7979,
     921,
     8091,
     8084,
+    7972,
+    953,
     7980,
     921,
     8092,
     8085,
+    7973,
+    953,
     7981,
     921,
     8093,
     8086,
+    7974,
+    953,
     7982,
     921,
     8094,
     8087,
+    7975,
+    953,
     7983,
     921,
     8095,
     8096,
+    8032,
+    953,
     8040,
     921,
     8104,
     8097,
+    8033,
+    953,
     8041,
     921,
     8105,
     8098,
+    8034,
+    953,
     8042,
     921,
     8106,
     8099,
+    8035,
+    953,
     8043,
     921,
     8107,
     8100,
+    8036,
+    953,
     8044,
     921,
     8108,
     8101,
+    8037,
+    953,
     8045,
     921,
     8109,
     8102,
+    8038,
+    953,
     8046,
     921,
     8110,
     8103,
+    8039,
+    953,
     8047,
     921,
     8111,
     8096,
+    8032,
+    953,
     8040,
     921,
     8104,
     8097,
+    8033,
+    953,
     8041,
     921,
     8105,
     8098,
+    8034,
+    953,
     8042,
     921,
     8106,
     8099,
+    8035,
+    953,
     8043,
     921,
     8107,
     8100,
+    8036,
+    953,
     8044,
     921,
     8108,
     8101,
+    8037,
+    953,
     8045,
     921,
     8109,
     8102,
+    8038,
+    953,
     8046,
     921,
     8110,
     8103,
+    8039,
+    953,
     8047,
     921,
     8111,
     8114,
+    8048,
+    953,
     8122,
     921,
     8122,
     837,
     8115,
+    945,
+    953,
     913,
     921,
     8124,
     8116,
+    940,
+    953,
     902,
     921,
     902,
     837,
     8118,
+    945,
+    834,
     913,
     834,
     8119,
+    945,
+    834,
+    953,
     913,
     834,
     921,
@@ -2462,27 +2644,43 @@
     834,
     837,
     8115,
+    945,
+    953,
     913,
     921,
     8124,
+    8126,
+    953,
+    921,
     8130,
+    8052,
+    953,
     8138,
     921,
     8138,
     837,
     8131,
+    951,
+    953,
     919,
     921,
     8140,
     8132,
+    942,
+    953,
     905,
     921,
     905,
     837,
     8134,
+    951,
+    834,
     919,
     834,
     8135,
+    951,
+    834,
+    953,
     919,
     834,
     921,
@@ -2490,60 +2688,97 @@
     834,
     837,
     8131,
+    951,
+    953,
     919,
     921,
     8140,
     8146,
+    953,
+    776,
+    768,
     921,
     776,
     768,
     8147,
+    953,
+    776,
+    769,
     921,
     776,
     769,
     8150,
+    953,
+    834,
     921,
     834,
     8151,
+    953,
+    776,
+    834,
     921,
     776,
     834,
     8162,
+    965,
+    776,
+    768,
     933,
     776,
     768,
     8163,
+    965,
+    776,
+    769,
     933,
     776,
     769,
     8164,
+    961,
+    787,
     929,
     787,
     8166,
+    965,
+    834,
     933,
     834,
     8167,
+    965,
+    776,
+    834,
     933,
     776,
     834,
     8178,
+    8060,
+    953,
     8186,
     921,
     8186,
     837,
     8179,
+    969,
+    953,
     937,
     921,
     8188,
     8180,
+    974,
+    953,
     911,
     921,
     911,
     837,
     8182,
+    969,
+    834,
     937,
     834,
     8183,
+    969,
+    834,
+    953,
     937,
     834,
     921,
@@ -2551,25 +2786,36 @@
     834,
     837,
     8179,
+    969,
+    953,
     937,
     921,
     8188,
     64256,
+    102,
+    102,
     70,
     70,
     70,
     102,
     64257,
+    102,
+    105,
     70,
     73,
     70,
     105,
     64258,
+    102,
+    108,
     70,
     76,
     70,
     108,
     64259,
+    102,
+    102,
+    105,
     70,
     70,
     73,
@@ -2577,6 +2823,9 @@
     102,
     105,
     64260,
+    102,
+    102,
+    108,
     70,
     70,
     76,
@@ -2584,36 +2833,50 @@
     102,
     108,
     64261,
+    115,
+    116,
     83,
     84,
     83,
     116,
     64262,
+    115,
+    116,
     83,
     84,
     83,
     116,
     64275,
+    1396,
+    1398,
     1348,
     1350,
     1348,
     1398,
     64276,
+    1396,
+    1381,
     1348,
     1333,
     1348,
     1381,
     64277,
+    1396,
+    1387,
     1348,
     1339,
     1348,
     1387,
     64278,
+    1406,
+    1398,
     1358,
     1350,
     1358,
     1398,
     64279,
+    1396,
+    1389,
     1348,
     1341,
     1348,
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -49,6 +49,7 @@
 NAME_ALIASES = "NameAliases%s.txt"
 NAMED_SEQUENCES = "NamedSequences%s.txt"
 SPECIAL_CASING = "SpecialCasing%s.txt"
+CASE_FOLDING = "CaseFolding%s.txt"
 
 # Private Use Areas -- in planes 1, 15, 16
 PUA_1 = range(0xE000, 0xF900)
@@ -424,28 +425,36 @@
             if "Case_Ignorable" in properties:
                 flags |= CASE_IGNORABLE_MASK
             sc = unicode.special_casing.get(char)
+            cf = unicode.case_folding.get(char, [char])
+            if record[12]:
+                upper = int(record[12], 16)
+            else:
+                upper = char
+            if record[13]:
+                lower = int(record[13], 16)
+            else:
+                lower = char
+            if record[14]:
+                title = int(record[14], 16)
+            else:
+                title = upper
+            if sc is None and cf != [lower]:
+                sc = ([lower], [title], [upper])
             if sc is None:
-                if record[12]:
-                    upper = int(record[12], 16)
-                else:
-                    upper = char
-                if record[13]:
-                    lower = int(record[13], 16)
-                else:
-                    lower = char
-                if record[14]:
-                    title = int(record[14], 16)
-                else:
-                    title = upper
                 if upper == lower == title:
                     upper = lower = title = 0
             else:
-                # This happens when some character maps to more than one
-                # character in uppercase, lowercase, or titlecase. The extra
-                # characters are stored in a different array.
+                # This happens either when some character maps to more than one
+                # character in uppercase, lowercase, or titlecase or the
+                # casefolded version of the character is different from the
+                # lowercase. The extra characters are stored in a different
+                # array.
                 flags |= EXTENDED_CASE_MASK
                 lower = len(extra_casing) | (len(sc[0]) << 24)
                 extra_casing.extend(sc[0])
+                if cf != sc[0]:
+                    lower |= len(cf) << 20
+                    extra_casing.extend(cf)
                 upper = len(extra_casing) | (len(sc[2]) << 24)
                 extra_casing.extend(sc[2])
                 # Title is probably equal to upper.
@@ -1107,6 +1116,17 @@
                 title = [int(char, 16) for char in data[2].split()]
                 upper = [int(char, 16) for char in data[3].split()]
                 sc[c] = (lower, title, upper)
+        cf = self.case_folding = {}
+        if version != '3.2.0':
+            with open_data(CASE_FOLDING, version) as file:
+                for s in file:
+                    s = s[:-1].split('#', 1)[0]
+                    if not s:
+                        continue
+                    data = s.split("; ")
+                    if data[1] in "CF":
+                        c = int(data[0], 16)
+                        cf[c] = [int(char, 16) for char in data[2].split()]
 
     def uselatin1(self):
         # restrict character range to ISO Latin 1

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list