[issue2771] Test issue

Ezio Melotti report at bugs.python.org
Thu Oct 6 12:50:53 CEST 2011


Ezio Melotti <ezio.melotti at gmail.com> added the comment:

test attachments

----------
Added file: http://bugs.python.org/file23325/unnamed
Added file: http://bugs.python.org/file23326/issue12753-3.diff

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue2771>
_______________________________________
-------------- next part --------------
test attachments<br>
<br><br>
-------------- next part --------------
diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -29,6 +29,9 @@
    Look up character by name.  If a character with the given name is found, return
    the corresponding character.  If not found, :exc:`KeyError` is raised.
 
+   .. versionchanged:: 3.3
+      Support for name aliases [#]_ and named sequences [#]_ has been added.
+
 
 .. function:: name(chr[, default])
 
@@ -160,3 +163,9 @@
    >>> unicodedata.bidirectional('\u0660') # 'A'rabic, 'N'umber
    'AN'
 
+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NamedSequences.txt
diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -492,13 +492,13 @@
 +-----------------+---------------------------------+-------+
 | Escape Sequence | Meaning                         | Notes |
 +=================+=================================+=======+
-| ``\N{name}``    | Character named *name* in the   |       |
+| ``\N{name}``    | Character named *name* in the   | \(4)  |
 |                 | Unicode database                |       |
 +-----------------+---------------------------------+-------+
-| ``\uxxxx``      | Character with 16-bit hex value | \(4)  |
+| ``\uxxxx``      | Character with 16-bit hex value | \(5)  |
 |                 | *xxxx*                          |       |
 +-----------------+---------------------------------+-------+
-| ``\Uxxxxxxxx``  | Character with 32-bit hex value | \(5)  |
+| ``\Uxxxxxxxx``  | Character with 32-bit hex value | \(6)  |
 |                 | *xxxxxxxx*                      |       |
 +-----------------+---------------------------------+-------+
 
@@ -516,10 +516,14 @@
    with the given value.
 
 (4)
+   .. versionchanged:: 3.3
+      Support for name aliases [#]_ has been added.
+
+(5)
    Individual code units which form parts of a surrogate pair can be encoded using
    this escape sequence.  Exactly four hex digits are required.
 
-(5)
+(6)
    Any Unicode character can be encoded this way, but characters outside the Basic
    Multilingual Plane (BMP) will be encoded using a surrogate pair if Python is
    compiled to use 16-bit code units (the default).  Exactly eight hex digits
@@ -706,3 +710,8 @@
 occurrence outside string literals and comments is an unconditional error::
 
    $       ?       `
+
+
+.. rubric:: Footnotes
+
+.. [#] http://www.unicode.org/Public/6.0.0/ucd/NameAliases.txt
diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@@ -8,8 +8,11 @@
 """#"
 
 import unittest
+import unicodedata
 
 from test import support
+from http.client import HTTPException
+from test.test_normalization import check_version
 
 class UnicodeNamesTest(unittest.TestCase):
 
@@ -59,8 +62,6 @@
         )
 
     def test_ascii_letters(self):
-        import unicodedata
-
         for char in "".join(map(chr, range(ord("a"), ord("z")))):
             name = "LATIN SMALL LETTER %s" % char.upper()
             code = unicodedata.lookup(name)
@@ -81,7 +82,6 @@
         self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
         self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")
 
-        import unicodedata
         self.assertRaises(ValueError, unicodedata.name, "\ud7a4")
 
     def test_cjk_unified_ideographs(self):
@@ -97,14 +97,11 @@
         self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
 
     def test_bmp_characters(self):
-        import unicodedata
-        count = 0
         for code in range(0x10000):
             char = chr(code)
             name = unicodedata.name(char, None)
             if name is not None:
                 self.assertEqual(unicodedata.lookup(name), char)
-                count += 1
 
     def test_misc_symbols(self):
         self.checkletter("PILCROW SIGN", "\u00b6")
@@ -112,8 +109,65 @@
         self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
         self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")
 
+    def test_aliases(self):
+        # Check that the aliases defined in the NameAliases.txt file work.
+        # This should be updated when new aliases are added or the file
+        # should be downloaded and parsed instead.  See #12753.
+        aliases = [
+            ('LATIN CAPITAL LETTER GHA', 0x01A2),
+            ('LATIN SMALL LETTER GHA', 0x01A3),
+            ('KANNADA LETTER LLLA', 0x0CDE),
+            ('LAO LETTER FO FON', 0x0E9D),
+            ('LAO LETTER FO FAY', 0x0E9F),
+            ('LAO LETTER RO', 0x0EA3),
+            ('LAO LETTER LO', 0x0EA5),
+            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
+            ('YI SYLLABLE ITERATION MARK', 0xA015),
+            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
+            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
+        ]
+        for alias, codepoint in aliases:
+            self.checkletter(alias, chr(codepoint))
+            name = unicodedata.name(chr(codepoint))
+            self.assertNotEqual(name, alias)
+            self.assertEqual(unicodedata.lookup(alias),
+                             unicodedata.lookup(name))
+
+    def test_named_sequences_sample(self):
+        # Check a few named sequences.  See #12753.
+        sequences = [
+            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
+            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
+            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
+            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
+            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
+        ]
+        for seqname, codepoints in sequences:
+            self.assertEqual(unicodedata.lookup(seqname), codepoints)
+            with self.assertRaises(SyntaxError):
+                self.checkletter(seqname, None)
+
+    def test_named_sequences_full(self):
+        # Check all the named sequences
+        url = ("http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" %
+               unicodedata.unidata_version)
+        try:
+            testdata = support.open_urlresource(url, encoding="utf-8",
+                                                check=check_version)
+        except (IOError, HTTPException):
+            self.skipTest("Could not retrieve " + url)
+        self.addCleanup(testdata.close)
+        for line in testdata:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            seqname, codepoints = line.split(';')
+            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
+            self.assertEqual(unicodedata.lookup(seqname), codepoints)
+            with self.assertRaises(SyntaxError):
+                self.checkletter(seqname, None)
+
     def test_errors(self):
-        import unicodedata
         self.assertRaises(TypeError, unicodedata.name)
         self.assertRaises(TypeError, unicodedata.name, 'xx')
         self.assertRaises(TypeError, unicodedata.lookup)
diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1054,7 +1054,7 @@
 static int
 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 {
-    unsigned int h, v;
+    unsigned int h, v, k;
     unsigned int mask = code_size-1;
     unsigned int i, incr;
 
@@ -1100,6 +1100,17 @@
         return 1;
     }
 
+    /* check for aliases defined in NameAliases.txt */
+    for (k=0; k<aliases_count; k++) {
+        /* name might not be nul-terminated, so it's necessary to check
+           that the len of the two names is the same before comparing them */
+        if ((name_aliases[k].namelen == namelen) &&
+            (strncmp(name, name_aliases[k].name, namelen) == 0)) {
+            *code = name_aliases[k].codepoint;
+            return 1;
+        }
+    }
+
     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */
@@ -1176,6 +1187,26 @@
     return PyUnicode_FromString(name);
 }
 
+static PyObject *
+_lookup_named_sequences(char* name) {
+    /* Binary search; assumes named_sequences is sorted by name (strcmp order).
+       Returns a new str, or NULL with no exception set if name is not found. */
+    int low = 0, high = named_sequences_count - 1; /* inclusive bounds */
+    while (low <= high) {
+        int mid = low + (high - low) / 2;
+        int cmp = strcmp(name, named_sequences[mid].name);
+        if (cmp < 0)
+            high = mid - 1;
+        else if (cmp > 0)
+            low = mid + 1;
+        else
+            return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
+                                             named_sequences[mid].seq,
+                                             named_sequences[mid].seqlen);
+    }
+    return NULL;
+}
+
 PyDoc_STRVAR(unicodedata_lookup__doc__,
 "lookup(name)\n\
 \n\
@@ -1187,6 +1218,7 @@
 unicodedata_lookup(PyObject* self, PyObject* args)
 {
     Py_UCS4 code;
+    PyObject *codes; /* for named sequences */
 
     char* name;
     int namelen;
@@ -1194,9 +1226,13 @@
         return NULL;
 
     if (!_getcode(self, name, namelen, &code)) {
-        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
-                     name);
-        return NULL;
+        /* if the normal lookup fails try with named sequences */
+        codes = _lookup_named_sequences(name);
+        if (codes == NULL) {
+            PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
+            return NULL;
+        }
+        return codes;
     }
 
     return PyUnicode_FromOrdinal(code);
diff --git a/Modules/unicodename_db.h b/Modules/unicodename_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
@@ -18811,3 +18811,452 @@
 #define code_magic 47
 #define code_size 32768
 #define code_poly 32771
+
+typedef struct Alias {
+    char *name;
+    int namelen;
+    int codepoint;
+} alias;
+
+static const int aliases_count = 11;
+static const alias name_aliases[] = {
+    {"LATIN CAPITAL LETTER GHA", 24, 0x01A2},
+    {"LATIN SMALL LETTER GHA", 22, 0x01A3},
+    {"KANNADA LETTER LLLA", 19, 0x0CDE},
+    {"LAO LETTER FO FON", 17, 0x0E9D},
+    {"LAO LETTER FO FAY", 17, 0x0E9F},
+    {"LAO LETTER RO", 13, 0x0EA3},
+    {"LAO LETTER LO", 13, 0x0EA5},
+    {"TIBETAN MARK BKA- SHOG GI MGO RGYAN", 35, 0x0FD0},
+    {"YI SYLLABLE ITERATION MARK", 26, 0xA015},
+    {"PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET", 61, 0xFE18},
+    {"BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS", 52, 0x1D0C5},
+};
+
+typedef struct NamedSequence {
+    char *name;
+    int seqlen;
+    Py_UCS2 seq[4];
+} named_sequence;
+
+static const int named_sequences_count = 418;
+static const named_sequence named_sequences[] = {
+    {"BENGALI LETTER KHINYA", 3, {0x0995, 0x09CD, 0x09B7}},
+    {"GEORGIAN LETTER U-BRJGU", 2, {0x10E3, 0x0302}},
+    {"HIRAGANA LETTER BIDAKUON NGA", 2, {0x304B, 0x309A}},
+    {"HIRAGANA LETTER BIDAKUON NGE", 2, {0x3051, 0x309A}},
+    {"HIRAGANA LETTER BIDAKUON NGI", 2, {0x304D, 0x309A}},
+    {"HIRAGANA LETTER BIDAKUON NGO", 2, {0x3053, 0x309A}},
+    {"HIRAGANA LETTER BIDAKUON NGU", 2, {0x304F, 0x309A}},
+    {"KATAKANA LETTER AINU CE", 2, {0x30BB, 0x309A}},
+    {"KATAKANA LETTER AINU P", 2, {0x31F7, 0x309A}},
+    {"KATAKANA LETTER AINU TO", 2, {0x30C8, 0x309A}},
+    {"KATAKANA LETTER AINU TU", 2, {0x30C4, 0x309A}},
+    {"KATAKANA LETTER BIDAKUON NGA", 2, {0x30AB, 0x309A}},
+    {"KATAKANA LETTER BIDAKUON NGE", 2, {0x30B1, 0x309A}},
+    {"KATAKANA LETTER BIDAKUON NGI", 2, {0x30AD, 0x309A}},
+    {"KATAKANA LETTER BIDAKUON NGO", 2, {0x30B3, 0x309A}},
+    {"KATAKANA LETTER BIDAKUON NGU", 2, {0x30AF, 0x309A}},
+    {"KHMER CONSONANT SIGN COENG BA", 2, {0x17D2, 0x1794}},
+    {"KHMER CONSONANT SIGN COENG CA", 2, {0x17D2, 0x1785}},
+    {"KHMER CONSONANT SIGN COENG CHA", 2, {0x17D2, 0x1786}},
+    {"KHMER CONSONANT SIGN COENG CHO", 2, {0x17D2, 0x1788}},
+    {"KHMER CONSONANT SIGN COENG CO", 2, {0x17D2, 0x1787}},
+    {"KHMER CONSONANT SIGN COENG DA", 2, {0x17D2, 0x178A}},
+    {"KHMER CONSONANT SIGN COENG DO", 2, {0x17D2, 0x178C}},
+    {"KHMER CONSONANT SIGN COENG HA", 2, {0x17D2, 0x17A0}},
+    {"KHMER CONSONANT SIGN COENG KA", 2, {0x17D2, 0x1780}},
+    {"KHMER CONSONANT SIGN COENG KHA", 2, {0x17D2, 0x1781}},
+    {"KHMER CONSONANT SIGN COENG KHO", 2, {0x17D2, 0x1783}},
+    {"KHMER CONSONANT SIGN COENG KO", 2, {0x17D2, 0x1782}},
+    {"KHMER CONSONANT SIGN COENG LA", 2, {0x17D2, 0x17A1}},
+    {"KHMER CONSONANT SIGN COENG LO", 2, {0x17D2, 0x179B}},
+    {"KHMER CONSONANT SIGN COENG MO", 2, {0x17D2, 0x1798}},
+    {"KHMER CONSONANT SIGN COENG NA", 2, {0x17D2, 0x178E}},
+    {"KHMER CONSONANT SIGN COENG NGO", 2, {0x17D2, 0x1784}},
+    {"KHMER CONSONANT SIGN COENG NO", 2, {0x17D2, 0x1793}},
+    {"KHMER CONSONANT SIGN COENG NYO", 2, {0x17D2, 0x1789}},
+    {"KHMER CONSONANT SIGN COENG PHA", 2, {0x17D2, 0x1795}},
+    {"KHMER CONSONANT SIGN COENG PHO", 2, {0x17D2, 0x1797}},
+    {"KHMER CONSONANT SIGN COENG PO", 2, {0x17D2, 0x1796}},
+    {"KHMER CONSONANT SIGN COENG RO", 2, {0x17D2, 0x179A}},
+    {"KHMER CONSONANT SIGN COENG SA", 2, {0x17D2, 0x179F}},
+    {"KHMER CONSONANT SIGN COENG SHA", 2, {0x17D2, 0x179D}},
+    {"KHMER CONSONANT SIGN COENG SSA", 2, {0x17D2, 0x179E}},
+    {"KHMER CONSONANT SIGN COENG TA", 2, {0x17D2, 0x178F}},
+    {"KHMER CONSONANT SIGN COENG THA", 2, {0x17D2, 0x1790}},
+    {"KHMER CONSONANT SIGN COENG THO", 2, {0x17D2, 0x1792}},
+    {"KHMER CONSONANT SIGN COENG TO", 2, {0x17D2, 0x1791}},
+    {"KHMER CONSONANT SIGN COENG TTHA", 2, {0x17D2, 0x178B}},
+    {"KHMER CONSONANT SIGN COENG TTHO", 2, {0x17D2, 0x178D}},
+    {"KHMER CONSONANT SIGN COENG VO", 2, {0x17D2, 0x179C}},
+    {"KHMER CONSONANT SIGN COENG YO", 2, {0x17D2, 0x1799}},
+    {"KHMER INDEPENDENT VOWEL SIGN COENG QE", 2, {0x17D2, 0x17AF}},
+    {"KHMER INDEPENDENT VOWEL SIGN COENG QU", 2, {0x17D2, 0x17A7}},
+    {"KHMER INDEPENDENT VOWEL SIGN COENG RY", 2, {0x17D2, 0x17AB}},
+    {"KHMER INDEPENDENT VOWEL SIGN COENG RYY", 2, {0x17D2, 0x17AC}},
+    {"KHMER VOWEL SIGN AAM", 2, {0x17B6, 0x17C6}},
+    {"KHMER VOWEL SIGN COENG QA", 2, {0x17D2, 0x17A2}},
+    {"KHMER VOWEL SIGN OM", 2, {0x17BB, 0x17C6}},
+    {"LATIN CAPITAL LETTER A WITH MACRON AND GRAVE", 2, {0x0100, 0x0300}},
+    {"LATIN CAPITAL LETTER A WITH OGONEK AND ACUTE", 2, {0x0104, 0x0301}},
+    {"LATIN CAPITAL LETTER A WITH OGONEK AND TILDE", 2, {0x0104, 0x0303}},
+    {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00CA, 0x030C}},
+    {"LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00CA, 0x0304}},
+    {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0116, 0x0301}},
+    {"LATIN CAPITAL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0116, 0x0303}},
+    {"LATIN CAPITAL LETTER E WITH OGONEK AND ACUTE", 2, {0x0118, 0x0301}},
+    {"LATIN CAPITAL LETTER E WITH OGONEK AND TILDE", 2, {0x0118, 0x0303}},
+    {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0045, 0x0329}},
+    {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00C9, 0x0329}},
+    {"LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00C8, 0x0329}},
+    {"LATIN CAPITAL LETTER I WITH MACRON AND GRAVE", 2, {0x012A, 0x0300}},
+    {"LATIN CAPITAL LETTER I WITH OGONEK AND ACUTE", 2, {0x012E, 0x0301}},
+    {"LATIN CAPITAL LETTER I WITH OGONEK AND TILDE", 2, {0x012E, 0x0303}},
+    {"LATIN CAPITAL LETTER J WITH TILDE", 2, {0x004A, 0x0303}},
+    {"LATIN CAPITAL LETTER L WITH TILDE", 2, {0x004C, 0x0303}},
+    {"LATIN CAPITAL LETTER M WITH TILDE", 2, {0x004D, 0x0303}},
+    {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW", 2, {0x004F, 0x0329}},
+    {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00D3, 0x0329}},
+    {"LATIN CAPITAL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00D2, 0x0329}},
+    {"LATIN CAPITAL LETTER R WITH TILDE", 2, {0x0052, 0x0303}},
+    {"LATIN CAPITAL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0053, 0x0329}},
+    {"LATIN CAPITAL LETTER U WITH MACRON AND ACUTE", 2, {0x016A, 0x0301}},
+    {"LATIN CAPITAL LETTER U WITH MACRON AND GRAVE", 2, {0x016A, 0x0300}},
+    {"LATIN CAPITAL LETTER U WITH MACRON AND TILDE", 2, {0x016A, 0x0303}},
+    {"LATIN CAPITAL LETTER U WITH OGONEK AND ACUTE", 2, {0x0172, 0x0301}},
+    {"LATIN CAPITAL LETTER U WITH OGONEK AND TILDE", 2, {0x0172, 0x0303}},
+    {"LATIN SMALL LETTER A WITH MACRON AND GRAVE", 2, {0x0101, 0x0300}},
+    {"LATIN SMALL LETTER A WITH OGONEK AND ACUTE", 2, {0x0105, 0x0301}},
+    {"LATIN SMALL LETTER A WITH OGONEK AND TILDE", 2, {0x0105, 0x0303}},
+    {"LATIN SMALL LETTER AE WITH GRAVE", 2, {0x00E6, 0x0300}},
+    {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND CARON", 2, {0x00EA, 0x030C}},
+    {"LATIN SMALL LETTER E WITH CIRCUMFLEX AND MACRON", 2, {0x00EA, 0x0304}},
+    {"LATIN SMALL LETTER E WITH DOT ABOVE AND ACUTE", 2, {0x0117, 0x0301}},
+    {"LATIN SMALL LETTER E WITH DOT ABOVE AND TILDE", 2, {0x0117, 0x0303}},
+    {"LATIN SMALL LETTER E WITH OGONEK AND ACUTE", 2, {0x0119, 0x0301}},
+    {"LATIN SMALL LETTER E WITH OGONEK AND TILDE", 2, {0x0119, 0x0303}},
+    {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW", 2, {0x0065, 0x0329}},
+    {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00E9, 0x0329}},
+    {"LATIN SMALL LETTER E WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00E8, 0x0329}},
+    {"LATIN SMALL LETTER HOOKED SCHWA WITH ACUTE", 2, {0x025A, 0x0301}},
+    {"LATIN SMALL LETTER HOOKED SCHWA WITH GRAVE", 2, {0x025A, 0x0300}},
+    {"LATIN SMALL LETTER I WITH DOT ABOVE AND ACUTE", 3, {0x0069, 0x0307, 0x0301}},
+    {"LATIN SMALL LETTER I WITH DOT ABOVE AND GRAVE", 3, {0x0069, 0x0307, 0x0300}},
+    {"LATIN SMALL LETTER I WITH DOT ABOVE AND TILDE", 3, {0x0069, 0x0307, 0x0303}},
+    {"LATIN SMALL LETTER I WITH MACRON AND GRAVE", 2, {0x012B, 0x0300}},
+    {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND ACUTE", 3, {0x012F, 0x0307, 0x0301}},
+    {"LATIN SMALL LETTER I WITH OGONEK AND DOT ABOVE AND TILDE", 3, {0x012F, 0x0307, 0x0303}},
+    {"LATIN SMALL LETTER J WITH DOT ABOVE AND TILDE", 3, {0x006A, 0x0307, 0x0303}},
+    {"LATIN SMALL LETTER L WITH TILDE", 2, {0x006C, 0x0303}},
+    {"LATIN SMALL LETTER M WITH TILDE", 2, {0x006D, 0x0303}},
+    {"LATIN SMALL LETTER NG WITH TILDE ABOVE", 3, {0x006E, 0x0360, 0x0067}},
+    {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW", 2, {0x006F, 0x0329}},
+    {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND ACUTE", 2, {0x00F3, 0x0329}},
+    {"LATIN SMALL LETTER O WITH VERTICAL LINE BELOW AND GRAVE", 2, {0x00F2, 0x0329}},
+    {"LATIN SMALL LETTER OPEN O WITH ACUTE", 2, {0x0254, 0x0301}},
+    {"LATIN SMALL LETTER OPEN O WITH GRAVE", 2, {0x0254, 0x0300}},
+    {"LATIN SMALL LETTER R WITH TILDE", 2, {0x0072, 0x0303}},
+    {"LATIN SMALL LETTER S WITH VERTICAL LINE BELOW", 2, {0x0073, 0x0329}},
+    {"LATIN SMALL LETTER SCHWA WITH ACUTE", 2, {0x0259, 0x0301}},
+    {"LATIN SMALL LETTER SCHWA WITH GRAVE", 2, {0x0259, 0x0300}},
+    {"LATIN SMALL LETTER TURNED V WITH ACUTE", 2, {0x028C, 0x0301}},
+    {"LATIN SMALL LETTER TURNED V WITH GRAVE", 2, {0x028C, 0x0300}},
+    {"LATIN SMALL LETTER U WITH MACRON AND ACUTE", 2, {0x016B, 0x0301}},
+    {"LATIN SMALL LETTER U WITH MACRON AND GRAVE", 2, {0x016B, 0x0300}},
+    {"LATIN SMALL LETTER U WITH MACRON AND TILDE", 2, {0x016B, 0x0303}},
+    {"LATIN SMALL LETTER U WITH OGONEK AND ACUTE", 2, {0x0173, 0x0301}},
+    {"LATIN SMALL LETTER U WITH OGONEK AND TILDE", 2, {0x0173, 0x0303}},
+    {"MODIFIER LETTER EXTRA-HIGH EXTRA-LOW CONTOUR TONE BAR", 2, {0x02E5, 0x02E9}},
+    {"MODIFIER LETTER EXTRA-LOW EXTRA-HIGH CONTOUR TONE BAR", 2, {0x02E9, 0x02E5}},
+    {"TAMIL CONSONANT C", 2, {0x0B9A, 0x0BCD}},
+    {"TAMIL CONSONANT H", 2, {0x0BB9, 0x0BCD}},
+    {"TAMIL CONSONANT J", 2, {0x0B9C, 0x0BCD}},
+    {"TAMIL CONSONANT K", 2, {0x0B95, 0x0BCD}},
+    {"TAMIL CONSONANT KSS", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCD}},
+    {"TAMIL CONSONANT L", 2, {0x0BB2, 0x0BCD}},
+    {"TAMIL CONSONANT LL", 2, {0x0BB3, 0x0BCD}},
+    {"TAMIL CONSONANT LLL", 2, {0x0BB4, 0x0BCD}},
+    {"TAMIL CONSONANT M", 2, {0x0BAE, 0x0BCD}},
+    {"TAMIL CONSONANT N", 2, {0x0BA8, 0x0BCD}},
+    {"TAMIL CONSONANT NG", 2, {0x0B99, 0x0BCD}},
+    {"TAMIL CONSONANT NN", 2, {0x0BA3, 0x0BCD}},
+    {"TAMIL CONSONANT NNN", 2, {0x0BA9, 0x0BCD}},
+    {"TAMIL CONSONANT NY", 2, {0x0B9E, 0x0BCD}},
+    {"TAMIL CONSONANT P", 2, {0x0BAA, 0x0BCD}},
+    {"TAMIL CONSONANT R", 2, {0x0BB0, 0x0BCD}},
+    {"TAMIL CONSONANT RR", 2, {0x0BB1, 0x0BCD}},
+    {"TAMIL CONSONANT S", 2, {0x0BB8, 0x0BCD}},
+    {"TAMIL CONSONANT SH", 2, {0x0BB6, 0x0BCD}},
+    {"TAMIL CONSONANT SS", 2, {0x0BB7, 0x0BCD}},
+    {"TAMIL CONSONANT T", 2, {0x0BA4, 0x0BCD}},
+    {"TAMIL CONSONANT TT", 2, {0x0B9F, 0x0BCD}},
+    {"TAMIL CONSONANT V", 2, {0x0BB5, 0x0BCD}},
+    {"TAMIL CONSONANT Y", 2, {0x0BAF, 0x0BCD}},
+    {"TAMIL SYLLABLE CAA", 2, {0x0B9A, 0x0BBE}},
+    {"TAMIL SYLLABLE CAI", 2, {0x0B9A, 0x0BC8}},
+    {"TAMIL SYLLABLE CAU", 2, {0x0B9A, 0x0BCC}},
+    {"TAMIL SYLLABLE CE", 2, {0x0B9A, 0x0BC6}},
+    {"TAMIL SYLLABLE CEE", 2, {0x0B9A, 0x0BC7}},
+    {"TAMIL SYLLABLE CI", 2, {0x0B9A, 0x0BBF}},
+    {"TAMIL SYLLABLE CII", 2, {0x0B9A, 0x0BC0}},
+    {"TAMIL SYLLABLE CO", 2, {0x0B9A, 0x0BCA}},
+    {"TAMIL SYLLABLE COO", 2, {0x0B9A, 0x0BCB}},
+    {"TAMIL SYLLABLE CU", 2, {0x0B9A, 0x0BC1}},
+    {"TAMIL SYLLABLE CUU", 2, {0x0B9A, 0x0BC2}},
+    {"TAMIL SYLLABLE HAA", 2, {0x0BB9, 0x0BBE}},
+    {"TAMIL SYLLABLE HAI", 2, {0x0BB9, 0x0BC8}},
+    {"TAMIL SYLLABLE HAU", 2, {0x0BB9, 0x0BCC}},
+    {"TAMIL SYLLABLE HE", 2, {0x0BB9, 0x0BC6}},
+    {"TAMIL SYLLABLE HEE", 2, {0x0BB9, 0x0BC7}},
+    {"TAMIL SYLLABLE HI", 2, {0x0BB9, 0x0BBF}},
+    {"TAMIL SYLLABLE HII", 2, {0x0BB9, 0x0BC0}},
+    {"TAMIL SYLLABLE HO", 2, {0x0BB9, 0x0BCA}},
+    {"TAMIL SYLLABLE HOO", 2, {0x0BB9, 0x0BCB}},
+    {"TAMIL SYLLABLE HU", 2, {0x0BB9, 0x0BC1}},
+    {"TAMIL SYLLABLE HUU", 2, {0x0BB9, 0x0BC2}},
+    {"TAMIL SYLLABLE JAA", 2, {0x0B9C, 0x0BBE}},
+    {"TAMIL SYLLABLE JAI", 2, {0x0B9C, 0x0BC8}},
+    {"TAMIL SYLLABLE JAU", 2, {0x0B9C, 0x0BCC}},
+    {"TAMIL SYLLABLE JE", 2, {0x0B9C, 0x0BC6}},
+    {"TAMIL SYLLABLE JEE", 2, {0x0B9C, 0x0BC7}},
+    {"TAMIL SYLLABLE JI", 2, {0x0B9C, 0x0BBF}},
+    {"TAMIL SYLLABLE JII", 2, {0x0B9C, 0x0BC0}},
+    {"TAMIL SYLLABLE JO", 2, {0x0B9C, 0x0BCA}},
+    {"TAMIL SYLLABLE JOO", 2, {0x0B9C, 0x0BCB}},
+    {"TAMIL SYLLABLE JU", 2, {0x0B9C, 0x0BC1}},
+    {"TAMIL SYLLABLE JUU", 2, {0x0B9C, 0x0BC2}},
+    {"TAMIL SYLLABLE KAA", 2, {0x0B95, 0x0BBE}},
+    {"TAMIL SYLLABLE KAI", 2, {0x0B95, 0x0BC8}},
+    {"TAMIL SYLLABLE KAU", 2, {0x0B95, 0x0BCC}},
+    {"TAMIL SYLLABLE KE", 2, {0x0B95, 0x0BC6}},
+    {"TAMIL SYLLABLE KEE", 2, {0x0B95, 0x0BC7}},
+    {"TAMIL SYLLABLE KI", 2, {0x0B95, 0x0BBF}},
+    {"TAMIL SYLLABLE KII", 2, {0x0B95, 0x0BC0}},
+    {"TAMIL SYLLABLE KO", 2, {0x0B95, 0x0BCA}},
+    {"TAMIL SYLLABLE KOO", 2, {0x0B95, 0x0BCB}},
+    {"TAMIL SYLLABLE KSSA", 3, {0x0B95, 0x0BCD, 0x0BB7}},
+    {"TAMIL SYLLABLE KSSAA", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBE}},
+    {"TAMIL SYLLABLE KSSAI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC8}},
+    {"TAMIL SYLLABLE KSSAU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCC}},
+    {"TAMIL SYLLABLE KSSE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC6}},
+    {"TAMIL SYLLABLE KSSEE", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC7}},
+    {"TAMIL SYLLABLE KSSI", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BBF}},
+    {"TAMIL SYLLABLE KSSII", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC0}},
+    {"TAMIL SYLLABLE KSSO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCA}},
+    {"TAMIL SYLLABLE KSSOO", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BCB}},
+    {"TAMIL SYLLABLE KSSU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC1}},
+    {"TAMIL SYLLABLE KSSUU", 4, {0x0B95, 0x0BCD, 0x0BB7, 0x0BC2}},
+    {"TAMIL SYLLABLE KU", 2, {0x0B95, 0x0BC1}},
+    {"TAMIL SYLLABLE KUU", 2, {0x0B95, 0x0BC2}},
+    {"TAMIL SYLLABLE LAA", 2, {0x0BB2, 0x0BBE}},
+    {"TAMIL SYLLABLE LAI", 2, {0x0BB2, 0x0BC8}},
+    {"TAMIL SYLLABLE LAU", 2, {0x0BB2, 0x0BCC}},
+    {"TAMIL SYLLABLE LE", 2, {0x0BB2, 0x0BC6}},
+    {"TAMIL SYLLABLE LEE", 2, {0x0BB2, 0x0BC7}},
+    {"TAMIL SYLLABLE LI", 2, {0x0BB2, 0x0BBF}},
+    {"TAMIL SYLLABLE LII", 2, {0x0BB2, 0x0BC0}},
+    {"TAMIL SYLLABLE LLAA", 2, {0x0BB3, 0x0BBE}},
+    {"TAMIL SYLLABLE LLAI", 2, {0x0BB3, 0x0BC8}},
+    {"TAMIL SYLLABLE LLAU", 2, {0x0BB3, 0x0BCC}},
+    {"TAMIL SYLLABLE LLE", 2, {0x0BB3, 0x0BC6}},
+    {"TAMIL SYLLABLE LLEE", 2, {0x0BB3, 0x0BC7}},
+    {"TAMIL SYLLABLE LLI", 2, {0x0BB3, 0x0BBF}},
+    {"TAMIL SYLLABLE LLII", 2, {0x0BB3, 0x0BC0}},
+    {"TAMIL SYLLABLE LLLAA", 2, {0x0BB4, 0x0BBE}},
+    {"TAMIL SYLLABLE LLLAI", 2, {0x0BB4, 0x0BC8}},
+    {"TAMIL SYLLABLE LLLAU", 2, {0x0BB4, 0x0BCC}},
+    {"TAMIL SYLLABLE LLLE", 2, {0x0BB4, 0x0BC6}},
+    {"TAMIL SYLLABLE LLLEE", 2, {0x0BB4, 0x0BC7}},
+    {"TAMIL SYLLABLE LLLI", 2, {0x0BB4, 0x0BBF}},
+    {"TAMIL SYLLABLE LLLII", 2, {0x0BB4, 0x0BC0}},
+    {"TAMIL SYLLABLE LLLO", 2, {0x0BB4, 0x0BCA}},
+    {"TAMIL SYLLABLE LLLOO", 2, {0x0BB4, 0x0BCB}},
+    {"TAMIL SYLLABLE LLLU", 2, {0x0BB4, 0x0BC1}},
+    {"TAMIL SYLLABLE LLLUU", 2, {0x0BB4, 0x0BC2}},
+    {"TAMIL SYLLABLE LLO", 2, {0x0BB3, 0x0BCA}},
+    {"TAMIL SYLLABLE LLOO", 2, {0x0BB3, 0x0BCB}},
+    {"TAMIL SYLLABLE LLU", 2, {0x0BB3, 0x0BC1}},
+    {"TAMIL SYLLABLE LLUU", 2, {0x0BB3, 0x0BC2}},
+    {"TAMIL SYLLABLE LO", 2, {0x0BB2, 0x0BCA}},
+    {"TAMIL SYLLABLE LOO", 2, {0x0BB2, 0x0BCB}},
+    {"TAMIL SYLLABLE LU", 2, {0x0BB2, 0x0BC1}},
+    {"TAMIL SYLLABLE LUU", 2, {0x0BB2, 0x0BC2}},
+    {"TAMIL SYLLABLE MAA", 2, {0x0BAE, 0x0BBE}},
+    {"TAMIL SYLLABLE MAI", 2, {0x0BAE, 0x0BC8}},
+    {"TAMIL SYLLABLE MAU", 2, {0x0BAE, 0x0BCC}},
+    {"TAMIL SYLLABLE ME", 2, {0x0BAE, 0x0BC6}},
+    {"TAMIL SYLLABLE MEE", 2, {0x0BAE, 0x0BC7}},
+    {"TAMIL SYLLABLE MI", 2, {0x0BAE, 0x0BBF}},
+    {"TAMIL SYLLABLE MII", 2, {0x0BAE, 0x0BC0}},
+    {"TAMIL SYLLABLE MO", 2, {0x0BAE, 0x0BCA}},
+    {"TAMIL SYLLABLE MOO", 2, {0x0BAE, 0x0BCB}},
+    {"TAMIL SYLLABLE MU", 2, {0x0BAE, 0x0BC1}},
+    {"TAMIL SYLLABLE MUU", 2, {0x0BAE, 0x0BC2}},
+    {"TAMIL SYLLABLE NAA", 2, {0x0BA8, 0x0BBE}},
+    {"TAMIL SYLLABLE NAI", 2, {0x0BA8, 0x0BC8}},
+    {"TAMIL SYLLABLE NAU", 2, {0x0BA8, 0x0BCC}},
+    {"TAMIL SYLLABLE NE", 2, {0x0BA8, 0x0BC6}},
+    {"TAMIL SYLLABLE NEE", 2, {0x0BA8, 0x0BC7}},
+    {"TAMIL SYLLABLE NGAA", 2, {0x0B99, 0x0BBE}},
+    {"TAMIL SYLLABLE NGAI", 2, {0x0B99, 0x0BC8}},
+    {"TAMIL SYLLABLE NGAU", 2, {0x0B99, 0x0BCC}},
+    {"TAMIL SYLLABLE NGE", 2, {0x0B99, 0x0BC6}},
+    {"TAMIL SYLLABLE NGEE", 2, {0x0B99, 0x0BC7}},
+    {"TAMIL SYLLABLE NGI", 2, {0x0B99, 0x0BBF}},
+    {"TAMIL SYLLABLE NGII", 2, {0x0B99, 0x0BC0}},
+    {"TAMIL SYLLABLE NGO", 2, {0x0B99, 0x0BCA}},
+    {"TAMIL SYLLABLE NGOO", 2, {0x0B99, 0x0BCB}},
+    {"TAMIL SYLLABLE NGU", 2, {0x0B99, 0x0BC1}},
+    {"TAMIL SYLLABLE NGUU", 2, {0x0B99, 0x0BC2}},
+    {"TAMIL SYLLABLE NI", 2, {0x0BA8, 0x0BBF}},
+    {"TAMIL SYLLABLE NII", 2, {0x0BA8, 0x0BC0}},
+    {"TAMIL SYLLABLE NNAA", 2, {0x0BA3, 0x0BBE}},
+    {"TAMIL SYLLABLE NNAI", 2, {0x0BA3, 0x0BC8}},
+    {"TAMIL SYLLABLE NNAU", 2, {0x0BA3, 0x0BCC}},
+    {"TAMIL SYLLABLE NNE", 2, {0x0BA3, 0x0BC6}},
+    {"TAMIL SYLLABLE NNEE", 2, {0x0BA3, 0x0BC7}},
+    {"TAMIL SYLLABLE NNI", 2, {0x0BA3, 0x0BBF}},
+    {"TAMIL SYLLABLE NNII", 2, {0x0BA3, 0x0BC0}},
+    {"TAMIL SYLLABLE NNNAA", 2, {0x0BA9, 0x0BBE}},
+    {"TAMIL SYLLABLE NNNAI", 2, {0x0BA9, 0x0BC8}},
+    {"TAMIL SYLLABLE NNNAU", 2, {0x0BA9, 0x0BCC}},
+    {"TAMIL SYLLABLE NNNE", 2, {0x0BA9, 0x0BC6}},
+    {"TAMIL SYLLABLE NNNEE", 2, {0x0BA9, 0x0BC7}},
+    {"TAMIL SYLLABLE NNNI", 2, {0x0BA9, 0x0BBF}},
+    {"TAMIL SYLLABLE NNNII", 2, {0x0BA9, 0x0BC0}},
+    {"TAMIL SYLLABLE NNNO", 2, {0x0BA9, 0x0BCA}},
+    {"TAMIL SYLLABLE NNNOO", 2, {0x0BA9, 0x0BCB}},
+    {"TAMIL SYLLABLE NNNU", 2, {0x0BA9, 0x0BC1}},
+    {"TAMIL SYLLABLE NNNUU", 2, {0x0BA9, 0x0BC2}},
+    {"TAMIL SYLLABLE NNO", 2, {0x0BA3, 0x0BCA}},
+    {"TAMIL SYLLABLE NNOO", 2, {0x0BA3, 0x0BCB}},
+    {"TAMIL SYLLABLE NNU", 2, {0x0BA3, 0x0BC1}},
+    {"TAMIL SYLLABLE NNUU", 2, {0x0BA3, 0x0BC2}},
+    {"TAMIL SYLLABLE NO", 2, {0x0BA8, 0x0BCA}},
+    {"TAMIL SYLLABLE NOO", 2, {0x0BA8, 0x0BCB}},
+    {"TAMIL SYLLABLE NU", 2, {0x0BA8, 0x0BC1}},
+    {"TAMIL SYLLABLE NUU", 2, {0x0BA8, 0x0BC2}},
+    {"TAMIL SYLLABLE NYAA", 2, {0x0B9E, 0x0BBE}},
+    {"TAMIL SYLLABLE NYAI", 2, {0x0B9E, 0x0BC8}},
+    {"TAMIL SYLLABLE NYAU", 2, {0x0B9E, 0x0BCC}},
+    {"TAMIL SYLLABLE NYE", 2, {0x0B9E, 0x0BC6}},
+    {"TAMIL SYLLABLE NYEE", 2, {0x0B9E, 0x0BC7}},
+    {"TAMIL SYLLABLE NYI", 2, {0x0B9E, 0x0BBF}},
+    {"TAMIL SYLLABLE NYII", 2, {0x0B9E, 0x0BC0}},
+    {"TAMIL SYLLABLE NYO", 2, {0x0B9E, 0x0BCA}},
+    {"TAMIL SYLLABLE NYOO", 2, {0x0B9E, 0x0BCB}},
+    {"TAMIL SYLLABLE NYU", 2, {0x0B9E, 0x0BC1}},
+    {"TAMIL SYLLABLE NYUU", 2, {0x0B9E, 0x0BC2}},
+    {"TAMIL SYLLABLE PAA", 2, {0x0BAA, 0x0BBE}},
+    {"TAMIL SYLLABLE PAI", 2, {0x0BAA, 0x0BC8}},
+    {"TAMIL SYLLABLE PAU", 2, {0x0BAA, 0x0BCC}},
+    {"TAMIL SYLLABLE PE", 2, {0x0BAA, 0x0BC6}},
+    {"TAMIL SYLLABLE PEE", 2, {0x0BAA, 0x0BC7}},
+    {"TAMIL SYLLABLE PI", 2, {0x0BAA, 0x0BBF}},
+    {"TAMIL SYLLABLE PII", 2, {0x0BAA, 0x0BC0}},
+    {"TAMIL SYLLABLE PO", 2, {0x0BAA, 0x0BCA}},
+    {"TAMIL SYLLABLE POO", 2, {0x0BAA, 0x0BCB}},
+    {"TAMIL SYLLABLE PU", 2, {0x0BAA, 0x0BC1}},
+    {"TAMIL SYLLABLE PUU", 2, {0x0BAA, 0x0BC2}},
+    {"TAMIL SYLLABLE RAA", 2, {0x0BB0, 0x0BBE}},
+    {"TAMIL SYLLABLE RAI", 2, {0x0BB0, 0x0BC8}},
+    {"TAMIL SYLLABLE RAU", 2, {0x0BB0, 0x0BCC}},
+    {"TAMIL SYLLABLE RE", 2, {0x0BB0, 0x0BC6}},
+    {"TAMIL SYLLABLE REE", 2, {0x0BB0, 0x0BC7}},
+    {"TAMIL SYLLABLE RI", 2, {0x0BB0, 0x0BBF}},
+    {"TAMIL SYLLABLE RII", 2, {0x0BB0, 0x0BC0}},
+    {"TAMIL SYLLABLE RO", 2, {0x0BB0, 0x0BCA}},
+    {"TAMIL SYLLABLE ROO", 2, {0x0BB0, 0x0BCB}},
+    {"TAMIL SYLLABLE RRAA", 2, {0x0BB1, 0x0BBE}},
+    {"TAMIL SYLLABLE RRAI", 2, {0x0BB1, 0x0BC8}},
+    {"TAMIL SYLLABLE RRAU", 2, {0x0BB1, 0x0BCC}},
+    {"TAMIL SYLLABLE RRE", 2, {0x0BB1, 0x0BC6}},
+    {"TAMIL SYLLABLE RREE", 2, {0x0BB1, 0x0BC7}},
+    {"TAMIL SYLLABLE RRI", 2, {0x0BB1, 0x0BBF}},
+    {"TAMIL SYLLABLE RRII", 2, {0x0BB1, 0x0BC0}},
+    {"TAMIL SYLLABLE RRO", 2, {0x0BB1, 0x0BCA}},
+    {"TAMIL SYLLABLE RROO", 2, {0x0BB1, 0x0BCB}},
+    {"TAMIL SYLLABLE RRU", 2, {0x0BB1, 0x0BC1}},
+    {"TAMIL SYLLABLE RRUU", 2, {0x0BB1, 0x0BC2}},
+    {"TAMIL SYLLABLE RU", 2, {0x0BB0, 0x0BC1}},
+    {"TAMIL SYLLABLE RUU", 2, {0x0BB0, 0x0BC2}},
+    {"TAMIL SYLLABLE SAA", 2, {0x0BB8, 0x0BBE}},
+    {"TAMIL SYLLABLE SAI", 2, {0x0BB8, 0x0BC8}},
+    {"TAMIL SYLLABLE SAU", 2, {0x0BB8, 0x0BCC}},
+    {"TAMIL SYLLABLE SE", 2, {0x0BB8, 0x0BC6}},
+    {"TAMIL SYLLABLE SEE", 2, {0x0BB8, 0x0BC7}},
+    {"TAMIL SYLLABLE SHAA", 2, {0x0BB6, 0x0BBE}},
+    {"TAMIL SYLLABLE SHAI", 2, {0x0BB6, 0x0BC8}},
+    {"TAMIL SYLLABLE SHAU", 2, {0x0BB6, 0x0BCC}},
+    {"TAMIL SYLLABLE SHE", 2, {0x0BB6, 0x0BC6}},
+    {"TAMIL SYLLABLE SHEE", 2, {0x0BB6, 0x0BC7}},
+    {"TAMIL SYLLABLE SHI", 2, {0x0BB6, 0x0BBF}},
+    {"TAMIL SYLLABLE SHII", 2, {0x0BB6, 0x0BC0}},
+    {"TAMIL SYLLABLE SHO", 2, {0x0BB6, 0x0BCA}},
+    {"TAMIL SYLLABLE SHOO", 2, {0x0BB6, 0x0BCB}},
+    {"TAMIL SYLLABLE SHRII", 4, {0x0BB6, 0x0BCD, 0x0BB0, 0x0BC0}},
+    {"TAMIL SYLLABLE SHU", 2, {0x0BB6, 0x0BC1}},
+    {"TAMIL SYLLABLE SHUU", 2, {0x0BB6, 0x0BC2}},
+    {"TAMIL SYLLABLE SI", 2, {0x0BB8, 0x0BBF}},
+    {"TAMIL SYLLABLE SII", 2, {0x0BB8, 0x0BC0}},
+    {"TAMIL SYLLABLE SO", 2, {0x0BB8, 0x0BCA}},
+    {"TAMIL SYLLABLE SOO", 2, {0x0BB8, 0x0BCB}},
+    {"TAMIL SYLLABLE SSAA", 2, {0x0BB7, 0x0BBE}},
+    {"TAMIL SYLLABLE SSAI", 2, {0x0BB7, 0x0BC8}},
+    {"TAMIL SYLLABLE SSAU", 2, {0x0BB7, 0x0BCC}},
+    {"TAMIL SYLLABLE SSE", 2, {0x0BB7, 0x0BC6}},
+    {"TAMIL SYLLABLE SSEE", 2, {0x0BB7, 0x0BC7}},
+    {"TAMIL SYLLABLE SSI", 2, {0x0BB7, 0x0BBF}},
+    {"TAMIL SYLLABLE SSII", 2, {0x0BB7, 0x0BC0}},
+    {"TAMIL SYLLABLE SSO", 2, {0x0BB7, 0x0BCA}},
+    {"TAMIL SYLLABLE SSOO", 2, {0x0BB7, 0x0BCB}},
+    {"TAMIL SYLLABLE SSU", 2, {0x0BB7, 0x0BC1}},
+    {"TAMIL SYLLABLE SSUU", 2, {0x0BB7, 0x0BC2}},
+    {"TAMIL SYLLABLE SU", 2, {0x0BB8, 0x0BC1}},
+    {"TAMIL SYLLABLE SUU", 2, {0x0BB8, 0x0BC2}},
+    {"TAMIL SYLLABLE TAA", 2, {0x0BA4, 0x0BBE}},
+    {"TAMIL SYLLABLE TAI", 2, {0x0BA4, 0x0BC8}},
+    {"TAMIL SYLLABLE TAU", 2, {0x0BA4, 0x0BCC}},
+    {"TAMIL SYLLABLE TE", 2, {0x0BA4, 0x0BC6}},
+    {"TAMIL SYLLABLE TEE", 2, {0x0BA4, 0x0BC7}},
+    {"TAMIL SYLLABLE TI", 2, {0x0BA4, 0x0BBF}},
+    {"TAMIL SYLLABLE TII", 2, {0x0BA4, 0x0BC0}},
+    {"TAMIL SYLLABLE TO", 2, {0x0BA4, 0x0BCA}},
+    {"TAMIL SYLLABLE TOO", 2, {0x0BA4, 0x0BCB}},
+    {"TAMIL SYLLABLE TTAA", 2, {0x0B9F, 0x0BBE}},
+    {"TAMIL SYLLABLE TTAI", 2, {0x0B9F, 0x0BC8}},
+    {"TAMIL SYLLABLE TTAU", 2, {0x0B9F, 0x0BCC}},
+    {"TAMIL SYLLABLE TTE", 2, {0x0B9F, 0x0BC6}},
+    {"TAMIL SYLLABLE TTEE", 2, {0x0B9F, 0x0BC7}},
+    {"TAMIL SYLLABLE TTI", 2, {0x0B9F, 0x0BBF}},
+    {"TAMIL SYLLABLE TTII", 2, {0x0B9F, 0x0BC0}},
+    {"TAMIL SYLLABLE TTO", 2, {0x0B9F, 0x0BCA}},
+    {"TAMIL SYLLABLE TTOO", 2, {0x0B9F, 0x0BCB}},
+    {"TAMIL SYLLABLE TTU", 2, {0x0B9F, 0x0BC1}},
+    {"TAMIL SYLLABLE TTUU", 2, {0x0B9F, 0x0BC2}},
+    {"TAMIL SYLLABLE TU", 2, {0x0BA4, 0x0BC1}},
+    {"TAMIL SYLLABLE TUU", 2, {0x0BA4, 0x0BC2}},
+    {"TAMIL SYLLABLE VAA", 2, {0x0BB5, 0x0BBE}},
+    {"TAMIL SYLLABLE VAI", 2, {0x0BB5, 0x0BC8}},
+    {"TAMIL SYLLABLE VAU", 2, {0x0BB5, 0x0BCC}},
+    {"TAMIL SYLLABLE VE", 2, {0x0BB5, 0x0BC6}},
+    {"TAMIL SYLLABLE VEE", 2, {0x0BB5, 0x0BC7}},
+    {"TAMIL SYLLABLE VI", 2, {0x0BB5, 0x0BBF}},
+    {"TAMIL SYLLABLE VII", 2, {0x0BB5, 0x0BC0}},
+    {"TAMIL SYLLABLE VO", 2, {0x0BB5, 0x0BCA}},
+    {"TAMIL SYLLABLE VOO", 2, {0x0BB5, 0x0BCB}},
+    {"TAMIL SYLLABLE VU", 2, {0x0BB5, 0x0BC1}},
+    {"TAMIL SYLLABLE VUU", 2, {0x0BB5, 0x0BC2}},
+    {"TAMIL SYLLABLE YAA", 2, {0x0BAF, 0x0BBE}},
+    {"TAMIL SYLLABLE YAI", 2, {0x0BAF, 0x0BC8}},
+    {"TAMIL SYLLABLE YAU", 2, {0x0BAF, 0x0BCC}},
+    {"TAMIL SYLLABLE YE", 2, {0x0BAF, 0x0BC6}},
+    {"TAMIL SYLLABLE YEE", 2, {0x0BAF, 0x0BC7}},
+    {"TAMIL SYLLABLE YI", 2, {0x0BAF, 0x0BBF}},
+    {"TAMIL SYLLABLE YII", 2, {0x0BAF, 0x0BC0}},
+    {"TAMIL SYLLABLE YO", 2, {0x0BAF, 0x0BCA}},
+    {"TAMIL SYLLABLE YOO", 2, {0x0BAF, 0x0BCB}},
+    {"TAMIL SYLLABLE YU", 2, {0x0BAF, 0x0BC1}},
+    {"TAMIL SYLLABLE YUU", 2, {0x0BAF, 0x0BC2}},
+};
diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -25,7 +25,12 @@
 # written by Fredrik Lundh (fredrik at pythonware.com)
 #
 
-import sys, os, zipfile
+import os
+import sys
+import zipfile
+
+from textwrap import dedent
+from operator import itemgetter
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
@@ -39,6 +44,8 @@
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
+NAME_ALIASES = "NameAliases%s.txt"
+NAMED_SEQUENCES = "NamedSequences%s.txt"
 
 old_versions = ["3.2.0"]
 
@@ -692,6 +699,40 @@
     print("/* name->code dictionary */", file=fp)
     codehash.dump(fp, trace)
 
+    print(dedent("""
+        typedef struct Alias {
+            char *name;
+            int namelen;
+            int codepoint;
+        } alias;
+        """), file=fp)
+
+    print('static const int aliases_count = %d;' % len(unicode.aliases), file=fp)
+
+    print('static const alias name_aliases[] = {', file=fp)
+    for name, codepoint in unicode.aliases:
+        print('    {"%s", %d, 0x%04X},' % (name, len(name), codepoint), file=fp)
+    print('};', file=fp)
+
+    # the Py_UCS2 seq[4] should use Py_UCS4 if non-BMP chars are added to the
+    # sequences and have a higher number of elements if the sequences get longer
+    print(dedent("""
+        typedef struct NamedSequence {
+            char *name;
+            int seqlen;
+            Py_UCS2 seq[4];
+        } named_sequence;
+        """), file=fp)
+
+    print('static const int named_sequences_count = %d;' % len(unicode.named_sequences),
+          file=fp)
+
+    print('static const named_sequence named_sequences[] = {', file=fp)
+    for name, sequence in unicode.named_sequences:
+        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+        print('    {"%s", %d, {%s}},' % (name, len(sequence), seq_str), file=fp)
+    print('};', file=fp)
+
     fp.close()
 
 
@@ -855,6 +896,31 @@
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
+        self.aliases = []
+        with open_data(NAME_ALIASES, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s or s.startswith('#'):
+                    continue
+                char, name = s.split(';')
+                char = int(char, 16)
+                self.aliases.append((name, char))
+
+        self.named_sequences = []
+        with open_data(NAMED_SEQUENCES, version) as file:
+            for s in file:
+                s = s.strip()
+                if not s or s.startswith('#'):
+                    continue
+                name, chars = s.split(';')
+                chars = tuple(int(char, 16) for char in chars.split())
+                # check that the structure defined in makeunicodename is OK
+                assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
+                assert all(c <= 0xFFFF for c in chars), "use Py_UCS4 instead"
+                self.named_sequences.append((name, chars))
+        # sort names to enable binary search
+        self.named_sequences.sort(key=itemgetter(0))
+
         self.exclusions = {}
         with open_data(COMPOSITION_EXCLUSIONS, version) as file:
             for s in file:


More information about the Python-bugs-list mailing list