[Python-Dev] bytes / unicode

Wed Jun 23 18:49:13 CEST 2010

Oops, I forgot some important quoting (important for the algorithm,
maybe not actually for the discussion)...

from urllib.parse import urlsplit, urlunsplit
import encodings.idna

# urllib.parse.quote always returns str, and is also not as
# conservative in quoting as required here...
def quote_unsafe_bytes(b):
    """Percent-encode the bytes of *b* that may not appear raw in a URL.

    Control bytes (0x00-0x1F), space (0x20), DEL (0x7F) and every byte
    >= 0x80 are replaced by ``%XX`` using upper-case hex digits.  All
    other bytes are copied through unchanged -- including ``%`` and
    delimiters such as ``/``, ``?`` and ``#`` -- so input that is already
    percent-escaped is not escaped a second time.

    *b* is any iterable of integers in range(256) (``bytes``,
    ``bytearray``, ...).  Returns a new ``bytes`` object.
    """
    result = bytearray()
    for c in b:
        # Only the printable, non-space ASCII range 0x21-0x7E is kept raw.
        if c <= 0x20 or c >= 0x7F:
            result.extend(('%%%02X' % c).encode('ASCII'))
        else:
            result.append(c)
    return bytes(result)

def encode_http_url(url, page_encoding='ASCII', errors='strict'):
    """Encode the str *url* into an ASCII-only bytes URL.

    The URL is split with ``urlsplit`` and each component is encoded
    separately:

    * scheme and port -- ASCII;
    * host            -- the ``idna`` codec (per-label ToASCII);
    * userinfo, path and fragment -- UTF-8, then ``quote_unsafe_bytes``;
    * query           -- *page_encoding*, then ``quote_unsafe_bytes``.

    *errors* is passed to every ``str.encode`` call except the host one:
    the ``idna`` codec only supports strict error handling.

    Returns the reassembled URL as ``bytes``.  Raises ``UnicodeError``
    when a component cannot be encoded.
    """
    scheme, netloc, path, query, fragment = urlsplit(url)

    # Userinfo ends at the LAST '@'; the host itself can never contain one.
    userinfo, _, hostport = netloc.rpartition('@')

    if hostport.startswith('['):
        # Bracketed IPv6 literal: the colons inside the brackets are part
        # of the address, only a ':' right after ']' introduces a port.
        host, bracket, rest = hostport.partition(']')
        host += bracket
        port = rest[1:] if rest.startswith(':') else ''
    else:
        host, _, port = hostport.partition(':')

    # The codec splits on dots and applies ToASCII to each label, and
    # returns b'' for an empty host (relative URLs).
    netloc = host.encode('idna')
    if port:
        netloc = netloc + b':' + port.encode('ASCII', errors)
    if userinfo:
        netloc = (quote_unsafe_bytes(userinfo.encode('UTF-8', errors))
                  + b'@' + netloc)

    scheme = scheme.encode('ASCII', errors)
    path = quote_unsafe_bytes(path.encode('UTF-8', errors))
    query = quote_unsafe_bytes(query.encode(page_encoding, errors))
    fragment = quote_unsafe_bytes(fragment.encode('UTF-8', errors))
    # urlunsplit accepts an all-bytes tuple and then returns bytes.
    return urlunsplit((scheme, netloc, path, query, fragment))

--
Ian Bicking  |  http://blog.ianbicking.org