[Python-checkins] cpython (2.7): Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().

serhiy.storchaka python-checkins at python.org
Thu Mar 14 20:43:29 CET 2013


http://hg.python.org/cpython/rev/4927899bea8d
changeset:   82658:4927899bea8d
branch:      2.7
user:        Serhiy Storchaka <storchaka at gmail.com>
date:        Thu Mar 14 21:31:09 2013 +0200
summary:
  Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().

files:
  Lib/urllib.py   |  32 ++++++++++++++++++++-------
  Lib/urlparse.py |  42 +++++++++++++++++++++++++++++-------
  Misc/NEWS       |   2 +
  3 files changed, 58 insertions(+), 18 deletions(-)


diff --git a/Lib/urllib.py b/Lib/urllib.py
--- a/Lib/urllib.py
+++ b/Lib/urllib.py
@@ -28,6 +28,7 @@
 import time
 import sys
 import base64
+import re
 
 from urlparse import urljoin as basejoin
 
@@ -1198,22 +1199,35 @@
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a + b, chr(int(a + b, 16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def unquote_plus(s):
     """unquote('%7e/abc+def') -> '~/abc def'"""
diff --git a/Lib/urlparse.py b/Lib/urlparse.py
--- a/Lib/urlparse.py
+++ b/Lib/urlparse.py
@@ -28,6 +28,8 @@
 
 """
 
+import re
+
 __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
            "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
 
@@ -311,6 +313,15 @@
     else:
         return url, ''
 
+try:
+    unicode
+except NameError:
+    def _is_unicode(x):
+        return 0
+else:
+    def _is_unicode(x):
+        return isinstance(x, unicode)
+
 # unquote method for parse_qs and parse_qsl
 # Cannot use directly from urllib as it would create a circular reference
 # because urllib uses urlparse methods (urljoin).  If you update this function,
@@ -319,22 +330,35 @@
 _hexdig = '0123456789ABCDEFabcdef'
 _hextochr = dict((a+b, chr(int(a+b,16)))
                  for a in _hexdig for b in _hexdig)
+_asciire = re.compile('([\x00-\x7f]+)')
 
 def unquote(s):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    if _is_unicode(s):
+        if '%' not in s:
+            return s
+        bits = _asciire.split(s)
+        res = [bits[0]]
+        append = res.append
+        for i in range(1, len(bits), 2):
+            append(unquote(str(bits[i])).decode('latin1'))
+            append(bits[i + 1])
+        return ''.join(res)
+
+    bits = s.split('%')
     # fastpath
-    if len(res) == 1:
+    if len(bits) == 1:
         return s
-    s = res[0]
-    for item in res[1:]:
+    res = [bits[0]]
+    append = res.append
+    for item in bits[1:]:
         try:
-            s += _hextochr[item[:2]] + item[2:]
+            append(_hextochr[item[:2]])
+            append(item[2:])
         except KeyError:
-            s += '%' + item
-        except UnicodeDecodeError:
-            s += unichr(int(item[:2], 16)) + item[2:]
-    return s
+            append('%')
+            append(item)
+    return ''.join(res)
 
 def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
     """Parse a query given as a string argument.
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -214,6 +214,8 @@
 Library
 -------
 
+- Issue #1285086: Get rid of the refcounting hack and speed up urllib.unquote().
+
 - Issue #17368: Fix an off-by-one error in the Python JSON decoder that caused
   a failure while decoding empty object literals when object_pairs_hook was
   specified.

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list