Benchmarking stripping of Unicode characters which are invalid XML

Sun Mar 18 16:13:32 EDT 2012

Last week I was surprised to discover that there are Unicode characters that aren't valid in an XML document. This holds regardless of escaping (e.g. &#x00;) and Unicode encoding (e.g. UTF-8) - not every Unicode string can be stored in XML. The valid characters are (as of XML 1.0) #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]. Others, such as #x13, must be stripped, replaced, or placed inside a wrapper such as base64.

I didn't find an existing function to strip these, so I wrote some and benchmarked them. I'd be interested in thoughts, suggestions and improvements.

regsub_p2 was the fastest on a string containing mostly printable ASCII.

regsub_p1 0.422097921371 True
regsub_p2 0.353546857834 True
regsub_p3 0.697242021561 True
regsub_p4 0.677567005157 True
genexp_p1 6.43633103371 True
genexp_p2 6.43329787254 True
genexp_p3 6.80837488174 True
genexp_p4 6.81470417976 True
filter_p1 7.21444416046 True
filter_p2 7.46805095673 True
filter_p3 7.37018704414 True
filter_p4 7.03261303902 True
genexp_f1 12.8470640182 True
genexp_f2 5.43630099297 True
genexp_f3 4.9708840847 True
genexp_f4 12.2384109497 True
genexp_f5 6.95861411095 True
genexp_f6 4.7168610096 True
genexp_f7 20.2065701485 True
genexp_f8 21.1112251282 True

Regards, Alex
#!/usr/bin/python
# Valid XML 1.0 characters are
# #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# http://www.w3.org/TR/2008/PER-xml-20080205/#charsets
#
# Before passing an arbitrary unicode string to an XML encoder invalid characters
# must be stripped or replaced. Escaping them doesn't help - they're simply not
# allowed in a well formed XML 1.0 document.

# The following script benchmarks several functions that strip them

import re
import string
import timeit

# Fragments of the XML 1.0 Char production, written as regex character-class
# members so that the patterns below are assembled from the same parts.
_XML_CONTROLS = u'\x09\x0A\x0D'          # tab, line feed, carriage return
_XML_LOW_RANGE = u'\u0020-\uD7FF'        # space up to just below the surrogates
_XML_HIGH_RANGES = u'\uE000-\uFFFD\U00010000-\U0010FFFF'

# p1 and p2 each match ONE character that is not valid in XML 1.0.  They differ
# only in where the three control characters sit inside the negated class:
# first in p1, after the large low range in p2.
p1 = re.compile(u'[^' + _XML_CONTROLS + _XML_LOW_RANGE + _XML_HIGH_RANGES + u']',
                re.U)
p2 = re.compile(u'[^' + _XML_LOW_RANGE + _XML_CONTROLS + _XML_HIGH_RANGES + u']',
                re.U)

# p3 and p4 reuse the p1/p2 classes with a '+' quantifier, so a single match
# covers a whole run of consecutive invalid characters.
p3 = re.compile(u'%s+' % p1.pattern, re.U)
p4 = re.compile(u'%s+' % p2.pattern, re.U)

# Benchmark candidate: delete every match of p1 (one invalid XML 1.0 character
# per match) from s with a single regex substitution.
def regsub_p1(s): return p1.sub(u'', s)
# Benchmark candidate: same as a single-pass substitution, but using p2, whose
# character class lists the large U+0020-U+D7FF range before the control chars.
def regsub_p2(s): return p2.sub(u'', s)
# Benchmark candidate: single-pass substitution with p3, which matches a whole
# run of consecutive invalid characters at once (p1's class followed by '+').
def regsub_p3(s): return p3.sub(u'', s)
# Benchmark candidate: single-pass substitution with p4, which matches a whole
# run of consecutive invalid characters at once (p2's class followed by '+').
def regsub_p4(s): return p4.sub(u'', s)

# Benchmark candidate: walk s character by character and keep only those that
# p1 does not match, joining the survivors into a new unicode string.
def genexp_p1(s): return u''.join(c for c in s if not p1.match(c))
# Benchmark candidate: per-character generator expression that keeps each
# character p2 does not match.
def genexp_p2(s): return u''.join(c for c in s if not p2.match(c))
# Benchmark candidate: per-character generator expression that keeps each
# character p3 does not match (p3 is applied to one character at a time here).
def genexp_p3(s): return u''.join(c for c in s if not p3.match(c))
# Benchmark candidate: per-character generator expression that keeps each
# character p4 does not match (p4 is applied to one character at a time here).
def genexp_p4(s): return u''.join(c for c in s if not p4.match(c))

# Benchmark candidate: use filter() with a lambda to keep the characters of s
# that p1 does not match, then join the result.
def filter_p1(s): return u''.join(filter(lambda c: not p1.match(c), s))
# Benchmark candidate: use filter() with a lambda to keep the characters of s
# that p2 does not match, then join the result.
def filter_p2(s): return u''.join(filter(lambda c: not p2.match(c), s))
# Benchmark candidate: use filter() with a lambda to keep the characters of s
# that p3 does not match, then join the result.
def filter_p3(s): return u''.join(filter(lambda c: not p3.match(c), s))
# Benchmark candidate: use filter() with a lambda to keep the characters of s
# that p4 does not match, then join the result.
def filter_p4(s): return u''.join(filter(lambda c: not p4.match(c), s))

def f1(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: works on the integer code point.  The three allowed
    control characters (tab, LF, CR) are tested first, through a set that is
    built anew on every call; the three allowed ranges are tested afterwards.
    """
    i = ord(c)
    return (i in set([0x09, 0x0A, 0x0D]) or 0x0020 <= i <= 0xD7FF
            or 0xE000 <= i <= 0xFFFD or 0x00010000 <= i <= 0x0010FFFF)

def f2(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: works on the integer code point.  The U+0020-U+D7FF
    range is tested first, so the set of allowed control characters is only
    built when that first test fails.
    """
    i = ord(c)
    return (0x0020 <= i <= 0xD7FF or i in set([0x09, 0x0A, 0x0D])
            or 0xE000 <= i <= 0xFFFD or 0x00010000 <= i <= 0x0010FFFF)

def f3(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: compares c directly against unicode literals instead of
    converting it with ord().  The U+0020-U+D7FF range is tested first; the
    set of allowed control characters is only built when that test fails.
    """
    return (u'\u0020' <= c <= u'\uD7FF' or c in set([u'\x09', u'\x0A', u'\x0D'])
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f4(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: compares c directly against unicode literals.  The
    allowed control characters are tested first, through a set that is built
    anew on every call; the three allowed ranges are tested afterwards.
    """
    return (c in set([u'\x09', u'\x0A', u'\x0D']) or u'\u0020' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f5(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: compares c directly against unicode literals.  The
    allowed control characters (tab, LF, CR) are tested first with three
    equality checks instead of a set; the ranges are tested afterwards.
    """
    return (c == u'\x09' or c == u'\x0A' or c == u'\x0D' or u'\u0020' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f6(c):
    """Return True if the single character c is a valid XML 1.0 character.

    Benchmark variant: compares c directly against unicode literals.  The
    U+0020-U+D7FF range is tested first, followed by three equality checks
    for the allowed control characters and then the remaining ranges.
    """
    return (u'\u0020' <= c <= u'\uD7FF' or c == u'\x09' or c == u'\x0A' or c == u'\x0D'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

# Lookup data for the translate()-based predicates f7 and f8.
# All 256 code points U+0000..U+00FF, i.e. everything ISO-8859-1 can encode.
every_8bit = u''.join(map(unichr, range(256)))

# Split that alphabet into the characters f1 accepts and the ones it rejects.
valid_8bit = u''.join(ch8 for ch8 in every_8bit if f1(ch8))
invalid_8bit = u''.join(ch8 for ch8 in every_8bit if not f1(ch8))

# Byte-string form of the rejected characters, as needed by string.translate.
invalid_8bit_iso88591 = invalid_8bit.encode('iso-8859-1')

# Translation table that maps every rejected byte to NUL and leaves the
# other bytes unchanged.
_nul_padding = '\x00' * len(invalid_8bit_iso88591)
translator = string.maketrans(invalid_8bit_iso88591, _nul_padding)

def f7(c):
    """Return a true value if the single character c is valid in XML 1.0.

    Benchmark variant: characters up to U+00FF are encoded to ISO-8859-1 and
    pushed through the module-level ``translator`` table, which turns every
    invalid byte into NUL; ord() of the result is therefore 0 (false) for an
    invalid character and non-zero for a valid one.  Characters above U+00FF
    are checked with plain range comparisons.

    The U+0100-U+D7FF range is tested explicitly: without it every valid
    character in that range was rejected, because the translate() branch only
    covers U+0000-U+00FF.
    """
    return ((c <= u'\xff' and ord(string.translate(c.encode('iso-8859-1'), translator)))
            or u'\u0100' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

def f8(c):
    """Return a true value if the single character c is valid in XML 1.0.

    Benchmark variant: characters up to U+00FF are encoded to ISO-8859-1 and
    passed to string.translate() with ``invalid_8bit_iso88591`` as the set of
    bytes to delete; an invalid character yields the empty string (false) and
    a valid one yields a one-byte string (true).  Characters above U+00FF are
    checked with plain range comparisons.

    The U+0100-U+D7FF range is tested explicitly: without it every valid
    character in that range was rejected, because the translate() branch only
    covers U+0000-U+00FF.
    """
    return ((c <= u'\xff' and string.translate(c.encode('iso-8859-1'), None, invalid_8bit_iso88591))
            or u'\u0100' <= c <= u'\uD7FF'
            or u'\uE000' <= c <= u'\uFFFD' or u'\U00010000' <= c <= u'\U0010FFFF')

# Benchmark candidate: keep the characters of s for which f1 returns a true
# value and join them into a new unicode string.
def genexp_f1(s): return u''.join(c for c in s if f1(c))
# Benchmark candidate: keep the characters of s for which f2 returns a true
# value and join them into a new unicode string.
def genexp_f2(s): return u''.join(c for c in s if f2(c))
# Benchmark candidate: keep the characters of s for which f3 returns a true
# value and join them into a new unicode string.
def genexp_f3(s): return u''.join(c for c in s if f3(c))
# Benchmark candidate: keep the characters of s for which f4 returns a true
# value and join them into a new unicode string.
def genexp_f4(s): return u''.join(c for c in s if f4(c))
# Benchmark candidate: keep the characters of s for which f5 returns a true
# value and join them into a new unicode string.
def genexp_f5(s): return u''.join(c for c in s if f5(c))
# Benchmark candidate: keep the characters of s for which f6 returns a true
# value and join them into a new unicode string.
def genexp_f6(s): return u''.join(c for c in s if f6(c))
# Benchmark candidate: keep the characters of s for which f7 returns a true
# value and join them into a new unicode string.
def genexp_f7(s): return u''.join(c for c in s if f7(c))
# Benchmark candidate: keep the characters of s for which f8 returns a true
# value and join them into a new unicode string.
def genexp_f8(s): return u''.join(c for c in s if f8(c))

if __name__ == '__main__':
    sample_in = u'''Lorem ipsum dolor sit amet\x00, consectetur adipisicing
    elit, \tsed \rdo eiusmod tempor incididunt \x13ut labore et dolore magna
    \xf7aliqua.\ufffe'''

    expected_out = u'''Lorem ipsum dolor sit amet, consectetur adipisicing
    elit, \tsed \rdo eiusmod tempor incididunt ut labore et dolore magna
    \xf7aliqua.'''

    for func, inner_fun in [(regsub_p1, p1), (regsub_p2, p2),
                            (regsub_p3, p3), (regsub_p4, p4),
                            (genexp_p1, p1), (genexp_p2, p2),
                            (genexp_p3, p3), (genexp_p4, p4),
                            (filter_p1, p1), (filter_p2, p2),
                            (filter_p3, p3), (filter_p4, p4),
                            (genexp_f1, f1), (genexp_f2, f2),
                            (genexp_f3, f3), (genexp_f4, f4),
                            (genexp_f5, f5), (genexp_f6, f6),
                            (genexp_f7, f7), (genexp_f8, f8),
                            ]:
        t = timeit.Timer(r'%s(%s)' % (func.__name__, repr(sample_in)),
                         'from __main__ import %s' % (func.__name__,))
        print func.__name__,
        print min(t.repeat(3, 100000)),
        print func(sample_in) == expected_out,
        print