[Python-Dev] htmllib.HTMLescape()
Martin von Loewis
loewis@informatik.hu-berlin.de
Fri, 6 Oct 2000 14:39:05 +0200 (MET DST)
> The difficulty, I assume, lies in figuring out which encodings
> support what characters.
It's not that difficult to write a set of new codecs; with the module
below, I can do
>>> import xmlenc
>>> u'\u0416'.encode('latin-1-xml')
'&#x416;'
>>> unicode('X&#x245;Y','latin-1-xml')
u'X\u0245Y'
The difficulty is that the algorithm is not very efficient if there
are many unsupported characters in a string.
Regards,
Martin
# Module implementing a set of new encodings of the form <enc>-xml
# Copyright Martin v. Löwis
# It currently supports hex character references only
import codecs
try:
    _unichr = unichr  # Python 2: chr() only yields byte strings there
except NameError:
    _unichr = chr  # Python 3: chr() already returns a text character


class CodecWrapper:
    """Extend a base codec with hexadecimal character references.

    Encoding replaces every character the base encoder rejects with an
    ASCII reference of the form ``&#x<hex>;``.  Decoding turns such
    references back into the characters they name and hands everything
    else to the base decoder.

    ``encoder`` and ``decoder`` are stateless codec functions taking
    ``(input, errors)`` and returning ``(output, length_consumed)``.
    """

    def __init__(self, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder

    def encode(self, input, errors='strict'):
        """Encode ``input``, escaping characters the base codec rejects.

        ``errors`` is accepted for codec-API compatibility only: the base
        encoder is always run in strict mode and every failure is turned
        into a character reference.

        Returns ``(encoded, number_of_characters_consumed)``.  Raises the
        base encoder's ``ValueError`` if it rejects even an empty input.
        """
        try:
            return self.encoder(input, "strict")
        except ValueError:
            size = len(input)
            if size == 0:
                # Nothing left to bisect; the failure is not about a character.
                raise
            if size == 1:
                # .encode("ascii") keeps the result a byte string on Python 3.
                return ("&#x%x;" % ord(input)).encode("ascii"), 1
            # Bisect until the offending characters are isolated.  Floor
            # division keeps the slice index an integer on Python 3.
            mid = size // 2
            head, head_len = self.encode(input[:mid], errors)
            tail, tail_len = self.encode(input[mid:], errors)
            return head + tail, head_len + tail_len

    @staticmethod
    def _parse_reference(digits):
        """Return the character named by hex ``digits``, or None if invalid.

        Parsing is delegated to ``int(digits, 16)``, so it is exactly as
        lenient as ``int`` is.
        """
        try:
            return _unichr(int(digits, 16))
        except (ValueError, OverflowError):
            return None

    def decode(self, input, errors='strict'):
        """Decode ``input``, expanding ``&#x<hex>;`` references.

        A reference that is unterminated or does not parse is left in
        place and decoded as ordinary text by the base decoder.  Errors
        raised by the base decoder propagate unchanged.

        Returns ``(text, number_of_bytes_consumed)``.
        """
        data = bytes(input)  # might be a buffer/memoryview object
        pieces = []
        consumed = 0
        pending = 0  # start of the raw bytes not yet given to the base decoder
        scan = 0     # where to look for the next reference
        # Iterative scan: one recursion level per reference would overflow
        # the interpreter stack on inputs with many references.
        while True:
            ref = data.find(b"&#x", scan)
            if ref == -1:
                break
            end = data.find(b";", ref + 3)
            char = None
            if end != -1:
                char = self._parse_reference(data[ref + 3:end])
            if char is None:
                # Malformed: keep "&#x" as literal text and resume right
                # after those three bytes.
                scan = ref + 3
                continue
            if ref > pending:
                text, used = self.decoder(data[pending:ref], errors)
                pieces.append(text)
                consumed += used
            pieces.append(char)
            consumed += end + 1 - ref  # the whole reference, ';' included
            pending = scan = end + 1
        if pending < len(data) or not pieces:
            text, used = self.decoder(data[pending:], errors)
            pieces.append(text)
            consumed += used
        return u"".join(pieces), consumed

    def mkreader(self, stream, errors='strict'):
        """Return a ``codecs.StreamReader`` over ``stream`` using this codec.

        NOTE: ``decode`` treats an unterminated reference as literal text,
        so a reference split across two underlying reads is presumably not
        reassembled -- verify before relying on chunked reads.
        """
        reader = codecs.StreamReader(stream, errors)
        reader.decode = self.decode
        reader.encode = self.encode
        return reader

    def mkwriter(self, stream, errors='strict'):
        """Return a ``codecs.StreamWriter`` onto ``stream`` using this codec."""
        writer = codecs.StreamWriter(stream, errors)
        writer.decode = self.decode
        writer.encode = self.encode
        return writer
def search_function(encoding):
    """Codec search function for encodings named ``<enc>-xml``.

    ``encoding`` is the name as passed by the codec registry.  Both the
    ``-xml`` and ``_xml`` suffixes are accepted because Python 3.9+
    converts hyphens to underscores before calling search functions.

    Returns None when the name has no such suffix or when the base
    encoding is unknown, so the registry can go on to other search
    functions.  Otherwise returns a ``codecs.CodecInfo`` (still a 4-tuple
    of encoder, decoder, stream reader factory, stream writer factory)
    built from a ``CodecWrapper`` around the base codec.
    """
    if not encoding.endswith(("-xml", "_xml")):
        return None
    try:
        base = codecs.lookup(encoding[:-4])
    except LookupError:
        # Unknown base encoding: report "not mine" rather than raising
        # from inside the registry lookup.
        return None
    # Only the stateless encoder/decoder of the base codec are wrapped.
    c = CodecWrapper(base[0], base[1])
    return codecs.CodecInfo(
        name=encoding,
        encode=c.encode,
        decode=c.decode,
        streamreader=c.mkreader,
        streamwriter=c.mkwriter,
    )
# Install the lookup hook as an import-time side effect, so that encoding
# names handled by search_function become available once this module is
# imported.
codecs.register(search_function)