[Python-3000-checkins] r65560 - in python/branches/py3k/Lib: test/test_urllib.py urllib/parse.py

guido.van.rossum python-3000-checkins at python.org
Wed Aug 6 21:31:35 CEST 2008


Author: guido.van.rossum
Date: Wed Aug  6 21:31:34 2008
New Revision: 65560

Log:
Revert accidentally committed files.  Oops!


Modified:
   python/branches/py3k/Lib/test/test_urllib.py
   python/branches/py3k/Lib/urllib/parse.py

Modified: python/branches/py3k/Lib/test/test_urllib.py
==============================================================================
--- python/branches/py3k/Lib/test/test_urllib.py	(original)
+++ python/branches/py3k/Lib/test/test_urllib.py	Wed Aug  6 21:31:34 2008
@@ -465,7 +465,7 @@
 
     def test_unquote_with_unicode(self):
         r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
-        self.assertEqual(r, 'br\u00FCckner_sapporo_20050930.doc')
+        self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
 
 class urlencode_Tests(unittest.TestCase):
     """Tests for urlencode()"""

Modified: python/branches/py3k/Lib/urllib/parse.py
==============================================================================
--- python/branches/py3k/Lib/urllib/parse.py	(original)
+++ python/branches/py3k/Lib/urllib/parse.py	Wed Aug  6 21:31:34 2008
@@ -261,74 +261,84 @@
         return url, ''
 
 
-def unquote_as_string (s, plus=False, charset=None):
-    if charset is None:
-        charset = "UTF-8"
-    return str(unquote_as_bytes(s, plus=plus), charset, 'strict')
+_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
+_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
 
-def unquote_as_bytes (s, plus=False):
+def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    if plus:
-        s = s.replace('+', ' ')
     res = s.split('%')
-    res[0] = res[0].encode('ASCII', 'strict')
     for i in range(1, len(res)):
-        res[i] = (bytes.fromhex(res[i][:2]) +
-                  res[i][2:].encode('ASCII', 'strict'))
-    return b''.join(res)
-
-_always_safe = (b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-                b'abcdefghijklmnopqrstuvwxyz'
-                b'0123456789'
-                b'_.-')
-
-_percent_code = ord('%')
-
-_hextable = b'0123456789ABCDEF'
-
-def quote_as_bytes(s, safe = '/', plus=False):
-    """quote(b'abc@def') -> 'abc%40def'"""
-
-    if isinstance(s, str):
-        s = s.encode("UTF-8", "strict")
-    if not (isinstance(s, bytes) or isinstance(s, bytearray)):
-        raise ValueError("Argument to quote must be either bytes "
-                         "or bytearray; string arguments will be "
-                         "converted to UTF-8 bytes")
-
-    safeset = _always_safe + safe.encode('ASCII', 'strict')
-    if plus:
-        safeset += b' '
-
-    result = bytearray()
-    for i in s:
-        if i not in safeset:
-            result.append(_percent_code)
-            result.append(_hextable[(i >> 4) & 0xF])
-            result.append(_hextable[i & 0xF])
-        else:
-            result.append(i)
-    if plus:
-        result = result.replace(b' ', b'+')
-    return result
+        item = res[i]
+        try:
+            res[i] = _hextochr[item[:2]] + item[2:]
+        except KeyError:
+            res[i] = '%' + item
+        except UnicodeDecodeError:
+            res[i] = chr(int(item[:2], 16)) + item[2:]
+    return "".join(res)
 
-def quote_as_string(s, safe = '/', plus=False):
-    return str(quote_as_bytes(s, safe=safe, plus=plus), 'ASCII', 'strict')
+def unquote_plus(s):
+    """unquote('%7e/abc+def') -> '~/abc def'"""
+    s = s.replace('+', ' ')
+    return unquote(s)
+
+always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+               'abcdefghijklmnopqrstuvwxyz'
+               '0123456789' '_.-')
+_safe_quoters= {}
+
+class Quoter:
+    def __init__(self, safe):
+        self.cache = {}
+        self.safe = safe + always_safe
+
+    def __call__(self, c):
+        try:
+            return self.cache[c]
+        except KeyError:
+            if ord(c) < 256:
+                res = (c in self.safe) and c or ('%%%02X' % ord(c))
+                self.cache[c] = res
+                return res
+            else:
+                return "".join(['%%%02X' % i for i in c.encode("utf-8")])
 
-# finally, define defaults for 'quote' and 'unquote'
+def quote(s, safe = '/'):
+    """quote('abc def') -> 'abc%20def'
 
-def quote(s, safe='/'):
-    return quote_as_string(s, safe=safe)
+    Each part of a URL, e.g. the path info, the query, etc., has a
+    different set of reserved characters that must be quoted.
 
-def quote_plus(s, safe=''):
-    return quote_as_string(s, safe=safe, plus=True)
+    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
+    the following reserved characters.
 
-def unquote(s):
-    return unquote_as_string(s)
+    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
+                  "$" | ","
 
-def unquote_plus(s):
-    return unquote_as_string(s, plus=True)
+    Each of these characters is reserved in some component of a URL,
+    but not necessarily in all of them.
 
+    By default, the quote function is intended for quoting the path
+    section of a URL.  Thus, it will not encode '/'.  This character
+    is reserved, but in typical usage the quote function is being
+    called on a path where the existing slash characters are used as
+    reserved characters.
+    """
+    cachekey = (safe, always_safe)
+    try:
+        quoter = _safe_quoters[cachekey]
+    except KeyError:
+        quoter = Quoter(safe)
+        _safe_quoters[cachekey] = quoter
+    res = map(quoter, s)
+    return ''.join(res)
+
+def quote_plus(s, safe = ''):
+    """Quote the query fragment of a URL; replacing ' ' with '+'"""
+    if ' ' in s:
+        s = quote(s, safe + ' ')
+        return s.replace(' ', '+')
+    return quote(s, safe)
 
 def urlencode(query,doseq=0):
     """Encode a sequence of two-element tuples or dictionary into a URL query string.
@@ -377,7 +387,7 @@
                 # is there a reasonable way to convert to ASCII?
                 # encode generates a string, but "replace" or "ignore"
                 # lose information and "strict" can raise UnicodeError
-                v = quote_plus(v)
+                v = quote_plus(v.encode("ASCII","replace"))
                 l.append(k + '=' + v)
             else:
                 try:
@@ -464,8 +474,7 @@
         _userprog = re.compile('^(.*)@(.*)$')
 
     match = _userprog.match(host)
-    if match:
-        return map(unquote, match.group(1, 2))
+    if match: return map(unquote, match.group(1, 2))
     return None, host
 
 _passwdprog = None


More information about the Python-3000-checkins mailing list