htmllib.HTMLescape()

The difficulty, I assume, lies in figuring out which encodings support what characters.
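It's not that difficult to write a set of new codecs; with the module below, I can encode to, for example, "ascii-xml" and have every character the base encoding lacks written as a hex character reference.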
# Module implementing a set of new encodings of the form <enc>-xml
# Copyright Martin v. Löwis
# It currently supports hex character references only
#
# The original posting observed: "The difficulty is that the algorithm is not
# very efficient if there are many unsupported characters in a string."
# (Regards, Martin.)  That referred to an encoder which bisected the input
# every time the base codec failed.  Encoding now goes through a codec error
# handler instead, so the base codec is invoked once per call.
#
# This version targets Python 3: encoders produce bytes, decoders consume
# bytes-like objects.

import codecs
import re

# Name under which the hex character reference error handler is registered.
_ERROR_HANDLER = "xmlhexcharrefreplace"

# A well-formed hex character reference in encoded form, e.g. b"&#x20ac;".
# The pattern is made of ASCII bytes, so decoding only recognises references
# in data where they are stored as ASCII bytes.
_CHAR_REF = re.compile(br"&#x([0-9a-fA-F]+);")

# Suffix spellings accepted by search_function().  Since Python 3.9 the codec
# registry hands search functions names with hyphens turned into underscores,
# so "ascii-xml" arrives as "ascii_xml".  Both spellings are four characters.
_SUFFIXES = ("-xml", "_xml")


def _hex_charref_errors(exc):
    """Codec error handler: replace unencodable characters by ``&#x...;``.

    Only encoding errors are handled; any other exception is re-raised.
    Returns the replacement text and the position at which the codec should
    resume, as required by ``codecs.register_error``.
    """
    if not isinstance(exc, UnicodeEncodeError):
        raise exc
    unencodable = exc.object[exc.start:exc.end]
    refs = "".join("&#x%x;" % ord(ch) for ch in unencodable)
    return refs, exc.end


codecs.register_error(_ERROR_HANDLER, _hex_charref_errors)


class CodecWrapper:
    """Add hex character reference support to an existing codec.

    ``encoder`` and ``decoder`` are the stateless encode/decode functions of
    the base codec, each taking ``(input, errors)`` and returning a
    ``(output, length_consumed)`` tuple.
    """

    def __init__(self, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder

    def encode(self, input, errors='strict'):
        """Encode ``input``, writing unencodable characters as ``&#x...;``.

        ``errors`` is accepted for codec-interface compatibility but is not
        consulted: characters the base codec cannot represent are always
        replaced by a hex character reference rather than reported.

        Returns ``(encoded_bytes, number_of_characters_consumed)``.
        """
        return self.encoder(input, _ERROR_HANDLER)

    def decode(self, input, errors='strict'):
        """Decode ``input``, expanding well-formed ``&#x...;`` references.

        Text between references is decoded by the base decoder using
        ``errors``.  Anything that merely looks like the start of a reference
        (no terminating ``;``, non-hex digits, or a code point outside the
        Unicode range) is left in place and decoded as ordinary text.

        Returns ``(decoded_text, number_of_bytes_consumed)``; the whole input
        is always consumed.
        """
        data = bytes(input)  # might be a memoryview / buffer object
        pieces = []
        pos = 0
        # Iterate instead of recursing so that inputs with thousands of
        # references cannot exhaust the recursion limit.
        for match in _CHAR_REF.finditer(data):
            try:
                char = chr(int(match.group(1), 16))
            except (ValueError, OverflowError):
                # Not a valid code point: keep the text literally by leaving
                # it inside the next segment handed to the base decoder.
                continue
            if match.start() > pos:
                pieces.append(self.decoder(data[pos:match.start()], errors)[0])
            pieces.append(char)
            pos = match.end()
        # Always decode the tail (possibly empty) so that the result type
        # comes from the base decoder even for empty input.
        pieces.append(self.decoder(data[pos:], errors)[0])
        return "".join(pieces), len(data)

    def mkreader(self, stream, errors='strict'):
        """Stream reader factory, called by the codec machinery.

        NOTE(review): each chunk read from ``stream`` is decoded on its own,
        so a reference straddling two chunk reads would not be recognised.
        """
        reader = codecs.StreamReader(stream, errors)
        reader.decode = self.decode
        reader.encode = self.encode
        return reader

    def mkwriter(self, stream, errors='strict'):
        """Stream writer factory, called by the codec machinery."""
        writer = codecs.StreamWriter(stream, errors)
        writer.decode = self.decode
        writer.encode = self.encode
        return writer


def search_function(encoding):
    """Codec search function for encodings named ``<enc>-xml``.

    Returns a ``codecs.CodecInfo`` (a 4-tuple of encoder, decoder, stream
    reader factory and stream writer factory) wrapping the base encoding
    ``<enc>``, or ``None`` when ``encoding`` lacks the suffix or the base
    encoding is unknown.
    """
    if not encoding.endswith(_SUFFIXES):
        return None
    try:
        base = codecs.lookup(encoding[:-4])
    except LookupError:
        # Let the registry report the unknown encoding itself.
        return None
    wrapper = CodecWrapper(base.encode, base.decode)
    return codecs.CodecInfo(
        encode=wrapper.encode,
        decode=wrapper.decode,
        streamreader=wrapper.mkreader,
        streamwriter=wrapper.mkwriter,
        name=encoding,
    )


codecs.register(search_function)
participants (1)
-
Martin von Loewis