[Python-Dev] htmllib.HTMLescape()

Fri, 6 Oct 2000 14:39:05 +0200 (MET DST)

> The difficulty, I assume, lies in figuring out which encodings
> support what characters.

It's not that difficult to write a set of new codecs; with the module
below, I can do

>>> import xmlenc
>>> u'\u0416'.encode('latin-1-xml')
'&#x416;'
>>> unicode('X&#x245;Y','latin-1-xml')
u'X\u0245Y'

The difficulty is that the algorithm is not very efficient if there
are many unsupported characters in a string.

Regards,
Martin

# Module implementing a set of new encodings of the form <enc>-xml
# Copyright Martin v. L&ouml;wis
# It currently supports hex character references only

import codecs

class CodecWrapper:
    def __init__(self,encoder,decoder):
        self.encoder = encoder
        self.decoder = decoder

    def encode(self,input,errors='strict'):
        try:
            return self.encoder(input,"strict")
        except ValueError:
            l = len(input)
            if l==1:
                return "&#x%x;" % ord(input), 1
            s1,l1 = self.encode(input[:l/2])
            s2,l2 = self.encode(input[l/2:])
            return s1+s2,l1+l2

    def decode(self,input,errors='strict'):
        input = str(input) # might be buffer object
        pos = input.find("&#x")
        if pos == -1:
            return self.decoder(input,errors)
        r1,l1 = self.decode(input[:pos],errors)
        end = input.find(";",pos)
        try:
            if end==-1:
                raise ValueError # goto failure code below
            val = int(input[pos+3:end],16)
            r2,l2 = self.decode(input[end+1:],errors)
            return r1+unichr(val)+r2,l1+end-pos+l2
        except ValueError:
            # how to deal with errors in decode?
            r2,l2 = self.decode(input[pos+2:],errors)
            return r1+"&#x"+r2,l1+3+l2

    def mkreader(self):
        r = codecs.StreamReader()
        r.decode = self.decode
        r.encode = self.encode
        return r

    def mkwriter(self):
        r = codecs.StreamWriter()
        r.decode = self.decode
        r.encode = self.encode
        return r

def search_function(encoding):
    if not encoding.endswith("-xml"):
        return None
    enc,dec,reader,writer = codecs.lookup(encoding[:-4])
    c = CodecWrapper(enc,dec)
    return c.encode,c.decode,c.mkreader,c.mkwriter

codecs.register(search_function)