[Python-Dev] bytes / unicode
Ian Bicking
ianb at colorstudy.com
Wed Jun 23 18:49:13 CEST 2010
Oops, I forgot some important quoting (important for the algorithm,
maybe not actually for the discussion)...
from urllib.parse import urlsplit, urlunsplit
import encodings.idna
# urllib.parse.quote always returns str, and is also not as
# conservative in quoting as required here...
def quote_unsafe_bytes(b):
    """Percent-encode the bytes of *b* that may not appear raw in a URL.

    Control bytes and space (0x00-0x20), DEL (0x7F) and every non-ASCII
    byte (0x80-0xFF) are replaced by ``%XX`` with upper-case hex digits.
    All other bytes -- including ``%`` and the URL delimiters -- are
    copied through untouched, so text that is already quoted is not
    quoted a second time.

    *b* is any iterable yielding integers in ``range(256)`` (``bytes``,
    ``bytearray``, ...).  Returns a new ``bytes`` object.
    """
    quoted = bytearray()
    for c in b:
        # Space and DEL are just as invalid in a URL / HTTP request line
        # as the other control bytes, so they are escaped as well.
        if c <= 0x20 or c >= 0x7F:
            quoted.extend(('%%%02X' % c).encode('ASCII'))
        else:
            quoted.append(c)
    return bytes(quoted)
def encode_http_url(url, page_encoding='ASCII', errors='strict'):
    """Encode the text URL *url* into an all-ASCII ``bytes`` URL.

    The URL is split with ``urlsplit`` and every component is encoded on
    its own before the pieces are joined again with ``urlunsplit``:

    * scheme and port -- ASCII
    * user info, path and fragment -- UTF-8, then ``quote_unsafe_bytes``
    * host -- the ``'idna'`` codec (a bracketed IP literal is encoded
      as ASCII instead)
    * query -- *page_encoding*, then ``quote_unsafe_bytes``

    *errors* is the codec error handler passed to every ``str.encode``
    call except the IDNA one, which only supports strict handling.  An
    empty user info or an empty port is dropped together with its
    ``@`` / ``:`` delimiter.

    Returns the reassembled URL as ``bytes``.  A component that cannot
    be encoded under *errors* raises ``UnicodeError``.
    """
    scheme, netloc, path, query, fragment = urlsplit(url)

    # Split on the LAST '@' so that an '@' inside the password stays
    # part of the user info instead of leaking into the host.
    userinfo, _, hostport = netloc.rpartition('@')

    if hostport.startswith('['):
        # Bracketed IP literal such as "[::1]:8080": the colons inside
        # the brackets belong to the address, so the port can only
        # follow the closing ']'.
        literal, closing, rest = hostport.partition(']')
        port = rest.partition(':')[2]
        host_bytes = (literal + closing).encode('ASCII', errors)
    else:
        host, _, port = hostport.partition(':')
        # The 'idna' codec converts a dotted name label by label and
        # maps an empty host to b''.
        host_bytes = host.encode('idna')

    netloc_bytes = host_bytes
    if port:
        netloc_bytes += b':' + port.encode('ASCII', errors)
    if userinfo:
        netloc_bytes = (quote_unsafe_bytes(userinfo.encode('UTF-8', errors))
                        + b'@' + netloc_bytes)

    # urlunsplit returns bytes when all of its components are bytes.
    return urlunsplit((
        scheme.encode('ASCII', errors),
        netloc_bytes,
        quote_unsafe_bytes(path.encode('UTF-8', errors)),
        quote_unsafe_bytes(query.encode(page_encoding, errors)),
        quote_unsafe_bytes(fragment.encode('UTF-8', errors)),
    ))
--
Ian Bicking | http://blog.ianbicking.org
More information about the Python-Dev
mailing list