[Python-Dev] Unicode charnames impl.

Andrew M. Kuchling akuchlin@mems-exchange.org
Fri, 24 Mar 2000 16:46:28 -0500 (EST)

Here's a strawman codec for doing the \N{NULL} thing.  Questions:

0) Is the code below correct?

1) What the heck would this encoding be called?

2) What does .encode() do?  (Right now it escapes \N as \N{BACKSLASH}N.)

3) How can we store all those names?  The resulting dictionary makes a
361K .py file; Python dumps core trying to parse it.  (Another bug...)

4) What do you do with the error case \N{...... with no closing right bracket?
   Right now it stops at that point, and never advances any farther.  
   Maybe it should assume it's an error if there's no } within the
   next 200 chars or some similar limit?
5) Do we need StreamReader/Writer classes, too?

I've also added a script that parses the names out of the NamesList.txt 
file at ftp://ftp.unicode.org/Public/UNIDATA/.



import codecs

#from _namedict import namedict
# Tiny stand-in for the full generated name table: character name -> code
# point.  BACKSLASH must be present because NameCodec.encode() emits
# \N{BACKSLASH} escapes that decode() has to resolve again.
namedict = {'NULL': 0, 'START OF HEADING' : 1,
            'BACKSLASH': ord('\\')}

class NameCodec(codecs.Codec):
    r"""Strawman codec that expands \N{NAME} escapes into characters.

    Character names are resolved through the module-level ``namedict``
    mapping (name -> code point).
    """

    def encode(self,input,errors='strict'):
        r"""Escape every ``\N`` in *input* as ``\N{BACKSLASH}N``.

        Returns the escaped string only (not a tuple).  *errors* is
        accepted for signature compatibility but not consulted.
        """
        # XXX what should this do?  Escape the
        # sequence \N as '\N{BACKSLASH}N'?
        return input.replace( '\\N', '\\N{BACKSLASH}N' )

    def decode(self,input,errors='strict'):
        r"""Replace each ``\N{NAME}`` in *input* by the named character.

        Returns ``(consumed, output)``.  When every escape is terminated,
        ``consumed`` is ``len(input)``.  If an escape has no closing ``}``,
        decoding stops there: ``consumed`` is the index of that ``\N{`` and
        ``output`` holds everything decoded before it.

        Unknown names are handled according to *errors*:
          'strict'  -- raise ValueError
          'ignore'  -- drop the escape
          'replace' -- substitute U+FFFD
        Any other value behaves like 'ignore'.
        """
        # Collect the pieces and join once at the end instead of
        # re-concatenating the growing result on every escape.
        pieces = []
        last = 0
        index = input.find( '\\N{' )
        while index != -1:
            pieces.append( input[last:index] )
            r_bracket = input.find( '}', index)
            if r_bracket == -1:
                # No closing bracket; bail out and report how far we got.
                return index, u''.join( pieces )

            name = input[index + 3 : r_bracket]
            code = namedict.get( name )
            if code is not None:
                pieces.append( unichr(code) )
            elif errors == 'strict':
                raise ValueError('Unknown character name %s' % repr(name))
            elif errors == 'replace':
                pieces.append( unichr( 0xFFFD ) )
            # errors == 'ignore' (or anything else): the escape is dropped.
            last = r_bracket + 1
            index = input.find( '\\N{', last)

        # No further \N{ found: the rest of the input is literal text.
        pieces.append( input[last:] )
        return len(input), u''.join( pieces )

if __name__ == '__main__':
    # Self-test: decode two samples (the second ends in an unterminated
    # escape), then check that encode() followed by decode() gives back
    # the original text.
    codec = NameCodec()
    samples = ( r'b\lah blah \N{NULL} asdf',
                r'b\l\N{START OF HEADING}\N{NU' )
    for sample in samples:
        consumed, decoded = codec.decode(sample)
        print(repr(decoded))

        escaped = codec.encode(sample)
        restored = codec.decode(escaped)[1]
        print(repr(escaped))
        assert restored == sample

    # An unknown character name under the two lenient error modes.
    bad_name = r'blah blah \N{NULLsadf} asdf'
    for mode in ('replace', 'ignore'):
        print(repr(codec.decode(bad_name, errors=mode)))


# Hack to extract character names from NamesList.txt
# Output the repr() of the resulting dictionary
import re, sys, string

def extract_names(lines):
    r"""Build a {NAME: code point} dictionary from NamesList.txt lines.

    *lines* is any iterable of text lines.  Two kinds of line are used:

      XXXX<TAB>name        four hex digits, a tab, the character name
      <TAB>= alias, ...    comma-separated aliases for the preceding
                           character; anything after a ';' is dropped

    Names and aliases are upper-cased and stripped of surrounding
    whitespace.  The placeholder names <CONTROL> and <NOT A CHARACTER>
    are not recorded, but their code point still becomes the target of
    any alias lines that follow.  Alias lines that appear before any
    character line are ignored.
    """
    # The {4} must sit inside the group: a repeated group only keeps its
    # last repetition, i.e. a single hex digit.
    char_line = re.compile(r'([0-9a-fA-F]{4})\t(.*)')
    # Non-greedy alias text so that a trailing ';' comment is excluded.
    alias_line = re.compile(r'\t=\s*(.*?)\s*(?:;.*)?$')

    names = {}
    last_char = None
    for line in lines:
        m = char_line.match(line)
        if m is not None:
            last_char = int(m.group(1), 16)
            name = m.group(2).strip().upper()
            if name and name not in ('<CONTROL>',
                                     '<NOT A CHARACTER>'):
                names[ name ] = last_char
            continue

        m = alias_line.match(line)
        if m is not None and last_char is not None:
            for alias in m.group(1).upper().split(','):
                alias = alias.strip()
                if alias:
                    names[ alias ] = last_char
    return names


if __name__ == '__main__':
    namedict = extract_names(sys.stdin)
    # XXX and do what with this dictionary?
    print(namedict)