# Here's a strawman codec for doing the \N{NULL} thing.  Questions:
#
# 0) Is the code below correct?
#
# 1) What the heck would this encoding be called?
#
# 2) What does .encode() do?  (Right now it escapes \N as
#    \N{BACKSLASH}N.)
#
# 3) How can we store all those names?  The resulting dictionary makes a
#    361K .py file; Python dumps core trying to parse it.  (Another bug...)
#
# 4) What do you do with the error \N{...... no closing right bracket?
#    Right now it stops at that point, and never advances any farther.
#    Maybe it should assume it's an error if there's no } within the
#    next 200 chars or some similar limit?
#
# 5) Do we need StreamReader/Writer classes, too?
#
# I've also added a script that parses the names out of the NamesList.txt
# file at ftp://ftp.unicode.org/Public/UNIDATA/.
#
# --amk

import codecs
import re
import string
import sys

# The code below runs unchanged on Python 2 and Python 3: on Python 3 every
# str is already Unicode, so the two Python 2 builtins are simple aliases.
try:
    unichr
except NameError:
    unichr = chr
    unicode = str


# namecodec.py:
# =============

#from _namedict import namedict
namedict = {'NULL': 0, 'START OF HEADING': 1,
            'BACKSLASH': ord('\\')}


class NameCodec(codecs.Codec):
    r"""Strawman codec that expands \N{NAME} escapes via ``namedict``.

    NOTE(review): ``decode`` returns ``(consumed, output)`` and ``encode``
    returns a bare string, whereas the ``codecs.Codec`` convention is
    ``(output, consumed)`` for both.  The order is kept as posted so that
    existing callers keep working -- confirm before registering this class
    as a real codec.
    """

    def encode(self, input, errors='strict'):
        r"""Escape each literal \N in *input* as \N{BACKSLASH}N.

        *errors* is accepted for signature compatibility and is not used.
        Returns the escaped string.
        """
        # XXX what should this do?  Escape the
        # sequence \N as '\N{BACKSLASH}N'?
        return input.replace('\\N', '\\N{BACKSLASH}N')

    def decode(self, input, errors='strict'):
        r"""Replace every \N{NAME} escape in *input* by the named character.

        errors -- what to do with a name missing from ``namedict``:
                  'strict' raises ValueError, 'replace' substitutes
                  U+FFFD, 'ignore' (or any other value) drops the escape.

        Returns a ``(consumed, output)`` tuple.  When an escape has no
        closing '}', decoding stops there: *consumed* is the index of that
        unterminated '\N{' and *output* holds everything decoded before it.
        Otherwise *consumed* is ``len(input)``.
        """
        # Collect the pieces and join once instead of re-concatenating the
        # growing result on every escape.
        pieces = []
        last = 0
        index = input.find(u'\\N{')
        while index != -1:
            pieces.append(unicode(input[last:index]))
            r_bracket = input.find('}', index)
            if r_bracket == -1:
                # No closing bracket; bail out and report how far we got.
                return index, u''.join(pieces)

            name = input[index + 3:r_bracket]
            code = namedict.get(name)
            if code is not None:
                pieces.append(unichr(code))
            elif errors == 'strict':
                raise ValueError('Unknown character name %s' % repr(name))
            elif errors == 'replace':
                pieces.append(unichr(0xFFFD))
            # 'ignore' -- and, as in the original, any unrecognised
            # errors value -- emits nothing for the unknown name.

            last = r_bracket + 1
            index = input.find(u'\\N{', last)

        # No further \N{ found: the rest of the input is literal text.
        pieces.append(unicode(input[last:]))
        return len(input), u''.join(pieces)


def _selftest():
    """Run the round-trip checks that were namecodec.py's __main__ block."""
    c = NameCodec()
    for s in [r'b\lah blah \N{NULL} asdf',
              r'b\l\N{START OF HEADING}\N{NU']:
        used, s2 = c.decode(s)
        print(repr(s2))

        s3 = c.encode(s)
        _, s4 = c.decode(s3)
        print(repr(s3))
        assert s4 == s

    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='replace')))
    print(repr(c.decode(r'blah blah \N{NULLsadf} asdf', errors='ignore')))


# makenamelist.py
# ===============

# Hack to extract character names from NamesList.txt
# Output the repr() of the resulting dictionary

# A character line is "<hex code point> TAB <name>".  The quantifier sits
# inside the group so the whole code point is captured, not just its last
# digit; 5- and 6-digit code points are accepted as well.
_CHAR_LINE = re.compile(r'([0-9a-fA-F]{4,6})\t(.*)')

# An alias line is "TAB = name[, name...]"; anything from ';' on is dropped.
_ALIAS_LINE = re.compile(r'\t=\s*([^;]*)')

# Placeholder "names" that do not identify a single character.
_PLACEHOLDER_NAMES = ('<CONTROL>', '<NOT A CHARACTER>')


def make_name_dict(lines):
    """Build an upper-cased ``{name: code point}`` dict from NamesList.txt.

    lines -- any iterable of text lines (an open file, sys.stdin, a list).

    Every character line contributes its name unless that name is one of
    the placeholders <control> / <not a character>.  Every "= alias, ..."
    line contributes each comma-separated alias, mapped to the most recent
    character line; alias lines seen before any character line are skipped.
    Surrounding whitespace is stripped and empty names are ignored.
    """
    names = {}
    last_char = None
    for line in lines:
        m = _CHAR_LINE.match(line)
        if m is not None:
            last_char = int(m.group(1), 16)
            name = m.group(2).strip().upper()
            if name and name not in _PLACEHOLDER_NAMES:
                names[name] = last_char
            continue

        m = _ALIAS_LINE.match(line)
        if m is not None and last_char is not None:
            for alias in m.group(1).upper().split(','):
                alias = alias.strip()
                if alias:
                    names[alias] = last_char
    return names


if __name__ == '__main__':
    # The two scripts were posted as separate files.  With no arguments the
    # namecodec.py self-test runs; "makenamelist" as the first argument
    # runs makenamelist.py, reading NamesList.txt from standard input.
    if sys.argv[1:2] == ['makenamelist']:
        # XXX and do what with this dictionary?
        print(make_name_dict(sys.stdin))
    else:
        _selftest()