[XML-SIG] Encoding autodetection

Paul Prescod paulp@ActiveState.com
Sat, 24 Feb 2001 11:36:13 -0800


Lars Marius Garshol wrote:
> 
> * Paul Prescod
> |
> | Thanks anyways. I've written the code now. Would it be useful to
> | anyone else out there?
> 
> xmlproc could use it.  When the Unicode support is added it will need
> to do the same thing.

Yeah, that's where I looked first.

> I guess it could also be useful as a utility in some cases, such as in
> a web server.

I'll include it here for the record. If anyone wants to do anything with
it they can. It is hereby in the public domain. 

In response to a question I got privately: it will detect any encoding
that has a reasonable resemblance to an ASCII superset (e.g. UTF-8, ISO
8859-*, Shift-JIS) or to a 2-byte Unicode encoding (big or little
endian, with or without BOM). EBCDIC and 4-byte encodings are not
tested.

import codecs, encodings

"""Komodo will hand this library a buffer and ask it to either convert
it or auto-detect the type."""

# Lookup table from the first four bytes of a document to an initial
# encoding guess.  A None in a key stands for a byte whose value may vary
# (written "##" in the XML spec), so only the leading bytes of such a
# pattern are significant.
autodetect_dict = {
    # Four-byte (UCS-4) byte order marks.
    (0x00, 0x00, 0xFE, 0xFF): "ucs4_be",
    (0xFF, 0xFE, 0x00, 0x00): "ucs4_le",

    # Two-byte (UTF-16) byte order marks followed by arbitrary content.
    (0xFE, 0xFF, None, None): "utf_16_be",
    (0xFF, 0xFE, None, None): "utf_16_le",

    # "<?" stored as 16-bit code units without a byte order mark.
    (0x00, 0x3C, 0x00, 0x3F): "utf_16_be",
    (0x3C, 0x00, 0x3F, 0x00): "utf_16_le",

    # "<?xm" in an ASCII-compatible encoding.
    (0x3C, 0x3F, 0x78, 0x6D): "utf_8",

    # "<?xm" in EBCDIC.
    (0x4C, 0x6F, 0xA7, 0x94): "EBCDIC",
}

def autoDetectXMLEncoding(buffer):
    """Guess the character encoding of an XML document from its raw bytes.

    buffer -> encoding_name

    Detection happens in two steps:

    1. The leading bytes of ``buffer`` are looked up in ``autodetect_dict``,
       first as an exact four-byte pattern and then using only the first
       two bytes (the remaining two treated as variable).
    2. If that produced a guess and a decoder for it is installed, the
       buffer is decoded with it and the XML declaration, if present, is
       searched for an ``encoding="..."`` pseudo-attribute.  The declared
       name, when found, is returned exactly as written in the document.

    Parameters:
        buffer: a byte string holding the start of the document.  It
            should be at least 4 bytes long; shorter buffers are matched
            on whatever bytes are available.

    Returns:
        The encoding name.  ``"utf_8"`` (the XML default) is returned when
        nothing more specific can be determined.  Note that the returned
        name might not have an installed decoder (e.g. EBCDIC or
        Shift-JIS).
    """
    # Local import so this function does not rely on the module's import
    # line being changed.
    import re

    # According to the XML spec this is the default.  The code below
    # successively tries to refine it; whenever a refinement step fails,
    # the most recent value is returned.
    encoding = "utf_8"

    # bytearray yields integer byte values on both Python 2 and Python 3.
    head = tuple(bytearray(buffer[:4]))

    guess = None
    if len(head) == 4:
        guess = autodetect_dict.get(head)
    if not guess and len(head) >= 2:
        # Try again with the potentially variable bytes wildcarded.
        guess = autodetect_dict.get(head[:2] + (None, None))
    if not guess:
        return encoding

    # We have a guess: this is the new default.
    encoding = guess

    try:
        decoder = codecs.lookup(guess)[1]
    except LookupError:
        # No decoder installed for the guess (e.g. "EBCDIC"), so the
        # declaration cannot be read; the guess is the best answer.
        return encoding

    # A more efficient implementation would not decode the whole buffer at
    # once.  "replace" keeps stray bytes in the document body (or a
    # truncated trailing character) from aborting detection.
    text = decoder(buffer, "replace")[0]

    # A byte order mark decodes to U+FEFF; skip it so that the declaration
    # is recognised at the start of the text.
    if text.startswith(u"\ufeff"):
        text = text[1:]
    if not text.startswith(u"<?xml"):
        return encoding

    # Only look inside the declaration itself so that the word "encoding"
    # or a quote character in the document content cannot be mistaken for
    # the pseudo-attribute.  If the terminator is missing (truncated
    # buffer), fall back to the first line.
    end = text.find(u"?>")
    if end != -1:
        declaration = text[:end]
    else:
        declaration = text.split(u"\n")[0]

    # The closing quote must be the same character as the opening one.
    match = re.search(r"""encoding\s*=\s*(["'])([^"']+)\1""", declaration)
    if match:
        encoding = match.group(2)

    return encoding

##### Testing code 

# Documents containing a character outside Latin-1, so they can only be
# encoded with Unicode-capable codecs.  %s is replaced by the encoding
# name under test.
big_teststrs = (u"<?xml version='1.0' encoding='%s'?><abc>\u2222</abc>",
                u'<?xml version="1.0" encoding="%s"?><abc>\u2222</abc>')

big_encodings = [
    #name           BOM prefix
    ("utf-16"   ,           None),  # this one already has a BOM prefix
    ("utf-8"    ,           None),
    ("utf-16-le",           None),
    ("utf-16-be",           None),
    ("utf-16-le",           codecs.BOM_LE),
    ("utf-16-be",           codecs.BOM_BE)]

# The "MBCS" codec is only available on Windows; listing it
# unconditionally makes the test run fail with LookupError elsewhere.
try:
    codecs.lookup("MBCS")
except LookupError:
    pass
else:
    big_encodings.append(("MBCS", None))

# Pure-ASCII documents, usable with single-byte encodings.
little_teststrs = (u"<?xml version='1.0' encoding='%s'?><abc>q</abc>",
                u'<?xml version="1.0" encoding="%s"?><abc>q</abc>')
little_encodings = [
    ("ASCII"     ,          None),
    ("Latin-1"   ,          None),
    ("ISO 8859-1",          None)]

# Documents without an encoding declaration: detection has to rely on the
# byte pattern (or BOM) alone, so the expected result is the table name.
default_teststrs = ("<a>%s</a>", "<?xml version='1.0'?><a>%s</a>",
                    '<?xml version="1.0"?><a>%s</a>')

xml_default_encodings = [
    ("utf_8"    ,           None),
    ("utf_16_le",           codecs.BOM_LE),
    ("utf_16_be",           codecs.BOM_BE)]

def _assertSame(expr1, expr2):
    """Raise AssertionError unless ``expr1`` and ``expr2`` compare equal.

    The exception's arguments are ``(expr1, "!=", expr2)`` so that a
    failure shows both values.
    """
    if expr1 != expr2:
        # Call-style raise works on both Python 2 and Python 3; the
        # "raise Class, args" form is a syntax error on Python 3.
        raise AssertionError(expr1, "!=", expr2)

def testDetect(teststrs, test_encodings):
    """Check detection for every template / encoding combination.

    teststrs: templates containing one %s, which is filled in with the
        encoding name before the text is encoded.
    test_encodings: (encoding_name, bom) pairs; a non-empty bom is
        prepended to the encoded bytes.

    Raises AssertionError (via _assertSame) as soon as the detected
    encoding differs from the encoding name used to build the sample.
    """
    for codec_name, bom_prefix in test_encodings:
        for template in teststrs:
            encoded = (template % codec_name).encode(codec_name)
            sample = bom_prefix + encoded if bom_prefix else encoded
            detected = autoDetectXMLEncoding(sample)
            _assertSame(detected, codec_name)

def test():
    """Run the detection checks over all three groups of sample documents.

    Raises AssertionError on the first mismatch.
    """
    testDetect(big_teststrs, big_encodings)
    testDetect(little_teststrs, little_encodings)
    testDetect(default_teststrs, xml_default_encodings)

# Run the self-test when executed as a script.
if __name__ == "__main__":
    test()
    # Parenthesised single argument prints the same text on Python 2 and 3.
    print("All tests succeeded")


-- 
Vote for Your Favorite Python & Perl Programming  
Accomplishments in the first Active Awards! 
http://www.ActiveState.com/Awards