[issue18814] Add codecs.convert_surrogateescape to "clean" surrogate escaped strings

Serhiy Storchaka report at bugs.python.org
Tue Sep 23 17:34:11 CEST 2014


Serhiy Storchaka added the comment:

Good catch Antoine!

Here is a sample of a more complicated implementation.

----------
title: Add a convert_surrogates function to "clean" surrogate escaped strings -> Add codecs.convert_surrogateescape to "clean" surrogate escaped strings
Added file: http://bugs.python.org/file36700/convert_surrogates.py

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue18814>
_______________________________________
-------------- next part --------------
import codecs
import re

def convert_surrogates(data, errors='strict'):
    """Replace runs of surrogate code points in *data* using an error handler.

    Every maximal run of code points in the range U+D800..U+DFFF is
    reported to the error handler registered under *errors* (looked up with
    ``codecs.lookup_error``) as a ``UnicodeTranslateError``.  The handler's
    replacement string is substituted for the run and scanning resumes at
    the position the handler returns.

    Args:
        data: The string to clean.
        errors: Name of a registered codec error handler.  The handler must
            accept ``UnicodeTranslateError`` instances.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new string
        with each surrogate run replaced by the handler's output.

    Raises:
        UnicodeTranslateError: Raised by the ``'strict'`` handler for the
            first surrogate run found.
        LookupError: If *errors* is not a registered handler name and *data*
            contains at least one surrogate (the lookup is deferred until a
            surrogate is actually found).
    """
    # The upper bound is U+DFFF, the last surrogate.  A larger bound would
    # also match private-use characters (U+E000 and up), which are valid.
    pattern = re.compile(r'[\ud800-\udfff]+')

    match = pattern.search(data)
    if match is None:
        # Fast path: nothing to convert, hand back the original object.
        return data

    # Deferred until needed so clean input never triggers a handler lookup.
    handler = codecs.lookup_error(errors)
    pieces = []
    pos = 0
    while match is not None:
        pieces.append(data[pos:match.start()])
        # The handler returns the replacement text and the index in *data*
        # at which scanning should continue.
        repl, pos = handler(UnicodeTranslateError(data, match.start(),
                                                  match.end(),
                                                  'lone surrogates'))
        pieces.append(repl)
        match = pattern.search(data, pos)
    pieces.append(data[pos:])
    return ''.join(pieces)

def convert_surrogateescape(data, errors='strict'):
    """Re-handle bytes that were smuggled into *data* by ``surrogateescape``.

    Every maximal run of surrogate code points (U+D800..U+DFFF) in *data*
    is converted back to bytes by encoding it with the ``'ascii'`` codec and
    the ``'surrogateescape'`` error handler.  Those bytes are then reported
    to the error handler registered under *errors* as a
    ``UnicodeDecodeError``, and the handler's replacement string is
    substituted for the part of the run it consumed.

    Args:
        data: The string to clean.
        errors: Name of a registered codec error handler.  The handler must
            accept ``UnicodeDecodeError`` instances.

    Returns:
        *data* itself when it contains no surrogates, otherwise a new string
        with the escaped bytes replaced by the handler's output.

    Raises:
        UnicodeTranslateError: If a run contains a surrogate that
            ``surrogateescape`` cannot turn back into a byte (regardless of
            *errors*), or if the handler raises ``UnicodeDecodeError`` (as
            ``'strict'`` does).  Positions refer to indexes in *data*.
        LookupError: If *errors* is not a registered handler name and *data*
            contains at least one surrogate (the lookup is deferred until a
            surrogate is actually found).
    """
    # The upper bound is U+DFFF, the last surrogate.  A larger bound would
    # also match private-use characters (U+E000 and up), which are valid
    # and would otherwise be rejected by the encode step below.
    pattern = re.compile(r'[\ud800-\udfff]+')

    match = pattern.search(data)
    if match is None:
        # Fast path: nothing to convert, hand back the original object.
        return data

    # Deferred until needed so clean input never triggers a handler lookup.
    handler = codecs.lookup_error(errors)
    pieces = []
    pos = 0
    while match is not None:
        start = match.start()
        pieces.append(data[pos:start])

        # Recover the original bytes.  Each escapable surrogate maps to
        # exactly one byte, so offsets in `raw` equal offsets in the run.
        try:
            raw = data[start:match.end()].encode('ascii', 'surrogateescape')
        except UnicodeEncodeError as err:
            # Translate slice-relative positions to positions in *data*.
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                r'surrogates not in range \udc80-\udcff') from None

        try:
            repl, consumed = handler(UnicodeDecodeError('unicode', raw,
                                                        0, len(raw),
                                                        'lone surrogates'))
        except UnicodeDecodeError as err:
            # Report the failure in terms of the caller's string, not the
            # temporary bytes object.
            raise UnicodeTranslateError(data,
                                        err.start + start,
                                        err.end + start,
                                        err.reason) from None

        # The handler's resume position is an offset into `raw`; shift it
        # back into *data* coordinates.
        pos = consumed + start
        pieces.append(repl)
        match = pattern.search(data, pos)
    pieces.append(data[pos:])
    return ''.join(pieces)


More information about the Python-bugs-list mailing list