[Python-Dev] Unicode charnames impl.

Fri, 24 Mar 2000 16:46:28 -0500 (EST)

Here's a strawman codec for doing the \N{NULL} thing.  Questions:

0) Is the code below correct?

1) What the heck would this encoding be called?

2) What does .encode() do?  (Right now it escapes \N as
\N{BACKSLASH}N.)

3) How can we store all those names?  The resulting dictionary makes a
361K .py file; Python dumps core trying to parse it.  (Another bug...)

4) What do you do with the error case \N{...... with no closing right bracket?
   Right now it stops at that point, and never advances any farther.  
   Maybe it should assume it's an error if there's no } within the
   next 200 chars or some similar limit?

5) Do we need StreamReader/Writer classes, too?

I've also added a script that parses the names out of the NamesList.txt 
file at ftp://ftp.unicode.org/Public/UNIDATA/.

--amk 

namecodec.py:
=============

import codecs

#from _namedict import namedict
# Stand-in name table (character name -> code point) used while the full
# generated table cannot be imported.
namedict = {
    'NULL': 0,
    'START OF HEADING': 1,
    'BACKSLASH': ord('\\'),
}

# Text constructors: the Python 2 built-ins when they exist, otherwise their
# Python 3 equivalents, so the codec runs unchanged on either interpreter.
try:
    _text, _unichr = unicode, unichr
except NameError:
    _text, _unichr = str, chr

# Opening delimiter of a named-character escape.
_NAME_START = '\\N{'


class NameCodec(codecs.Codec):
    """Strawman codec translating ``\\N{NAME}`` escapes via ``namedict``.

    NOTE(review): ``codecs.Codec`` documents ``encode()``/``decode()`` as
    returning ``(output, length consumed)``.  This class returns a bare
    string from ``encode()`` and ``(consumed, output)`` from ``decode()``;
    the shapes are kept as they are because existing callers unpack them
    in that order.
    """

    def encode(self, input, errors='strict'):
        """Escape every ``\\N`` in *input* as ``\\N{BACKSLASH}N``.

        The escaped text decodes back to the original, because the decoder
        turns ``\\N{BACKSLASH}`` into a backslash and resumes scanning
        after the closing brace.  *errors* is accepted but not used.

        Returns the escaped string (not a tuple).
        """
        # XXX what should this do?  Escape the
        # sequence \N as '\N{BACKSLASH}N'?
        return input.replace('\\N', '\\N{BACKSLASH}N')

    def decode(self, input, errors='strict'):
        """Replace each ``\\N{NAME}`` in *input* with the named character.

        *errors* selects what happens when NAME is not in ``namedict``:
        ``'strict'`` raises ``ValueError``, ``'replace'`` substitutes
        U+FFFD, and ``'ignore'`` (or any other value) drops the escape.

        Returns ``(consumed, output)``.  ``consumed`` is ``len(input)``
        when the whole input was processed.  If an escape has no closing
        ``}``, decoding stops there: ``consumed`` is the index of that
        escape and ``output`` holds everything decoded before it.

        Raises ``ValueError`` for an unknown name when *errors* is
        ``'strict'``.
        """
        # Collect the pieces and join once, instead of re-copying the
        # growing result on every concatenation.
        pieces = []
        last = 0
        index = input.find(_NAME_START)
        while index != -1:
            pieces.append(_text(input[last:index]))
            r_bracket = input.find('}', index)
            if r_bracket == -1:
                # No closing bracket; report how far we got and bail out.
                return index, _text('').join(pieces)

            name = input[index + len(_NAME_START):r_bracket]
            code = namedict.get(name)
            if code is not None:    # 0 (NULL) is a valid code point
                pieces.append(_unichr(code))
            elif errors == 'strict':
                raise ValueError('Unknown character name %s' % repr(name))
            elif errors == 'replace':
                pieces.append(_unichr(0xFFFD))
            # Otherwise ('ignore' or anything else) the escape is dropped.

            last = r_bracket + 1
            index = input.find(_NAME_START, last)

        # No further \N{ found: the rest of the input is plain text.
        pieces.append(_text(input[last:]))
        return len(input), _text('').join(pieces)

if __name__ == '__main__':
    # Self-test: decode two samples (the second ends in an unterminated
    # escape), then check that encode() followed by decode() gives back
    # the original text.
    c = NameCodec()
    for s in [ r'b\lah blah \N{NULL} asdf',
               r'b\l\N{START OF HEADING}\N{NU' ]:
        _, s2 = c.decode(s)
        # Single-argument print(...) behaves the same whether print is a
        # statement or a function.
        print(repr(s2))

        s3 = c.encode(s)
        _, s4 = c.decode(s3)
        print(repr(s3))
        assert s4 == s

    # Unknown character name under the two non-strict error modes.
    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='replace')))
    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='ignore')))

makenamelist.py
===============

# Hack to extract character names from NamesList.txt
# Output the repr() of the resulting dictionary

import re, sys, string

# A character entry: a 4-6 digit hex code point, a tab, then the name.
# The repeat count sits inside the group so the whole code point is captured,
# not just its last digit.
_CHAR_LINE = re.compile(r'([0-9a-fA-F]{4,6})\t(.*)')

# An alias line: tab, '=', then a comma-separated list of names.  The capture
# stops at ';' so a trailing comment is not taken as part of a name.
_ALIAS_LINE = re.compile(r'\t=\s*([^;]*)')

# Placeholder entries that are not real character names.
_PLACEHOLDER_NAMES = ('<CONTROL>', '<NOT A CHARACTER>')


def parse_namelist(lines):
    """Build a ``{NAME: code point}`` dictionary from NamesList.txt lines.

    *lines* is any iterable of text lines.  Each character entry maps its
    upper-cased name to its code point, unless the name is one of the
    placeholders ``<control>`` / ``<not a character>``.  Each alias line
    maps every comma-separated alias, upper-cased, to the code point of
    the most recent character entry.  Surrounding whitespace is stripped
    from names, empty names are skipped, and alias lines that appear
    before any character entry are ignored.

    Returns the dictionary.
    """
    namedict = {}
    last_char = None
    for line in lines:
        m = _CHAR_LINE.match(line)
        if m is not None:
            last_char = int(m.group(1), 16)
            name = m.group(2).strip().upper()
            if name and name not in _PLACEHOLDER_NAMES:
                namedict[name] = last_char
            continue

        m = _ALIAS_LINE.match(line)
        # Without a preceding character entry there is nothing to attach
        # the alias to.
        if m is not None and last_char is not None:
            for alias in m.group(1).upper().split(','):
                alias = alias.strip()
                if alias:
                    namedict[alias] = last_char
    return namedict


if __name__ == '__main__':
    # Read line by line until EOF (readline() returns "" at end of input).
    namedict = parse_namelist(iter(sys.stdin.readline, ""))

    # XXX and do what with this dictionary?
    print(namedict)