[Python-checkins] cpython: Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution

antoine.pitrou python-checkins at python.org
Fri Aug 22 01:16:48 CEST 2014


http://hg.python.org/cpython/rev/b116489d31ff
changeset:   92175:b116489d31ff
user:        Antoine Pitrou <solipsis at pitrou.net>
date:        Thu Aug 21 19:16:17 2014 -0400
summary:
  Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the resolution of relative URLs, rather than RFCs 1808 and 2396.

Patch by Demian Brecht.

files:
  Doc/library/urllib.parse.rst |   5 +
  Lib/test/test_urlparse.py    |  40 +++++++++------
  Lib/urllib/parse.py          |  63 ++++++++++++++---------
  Misc/NEWS                    |   4 +
  4 files changed, 71 insertions(+), 41 deletions(-)


diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
--- a/Doc/library/urllib.parse.rst
+++ b/Doc/library/urllib.parse.rst
@@ -267,6 +267,11 @@
    :func:`urlunsplit`, removing possible *scheme* and *netloc* parts.
 
 
+   .. versionchanged:: 3.5
+
+      Behaviour updated to match the semantics defined in :rfc:`3986`.
+
+
 .. function:: urldefrag(url)
 
    If *url* contains a fragment identifier, return a modified version of *url*
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -211,10 +211,6 @@
 
         # "abnormal" cases from RFC 1808:
         self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
-        self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
-        self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
-        self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
-        self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
         self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.')
         self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g')
         self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..')
@@ -229,6 +225,13 @@
         #self.checkJoin(RFC1808_BASE, 'http:g', 'http:g')
         #self.checkJoin(RFC1808_BASE, 'http:', 'http:')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g')
+        # self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g')
+        # self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g')
+        # self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g')
+
+
     def test_RFC2368(self):
         # Issue 11467: path that starts with a number is not parsed correctly
         self.assertEqual(urllib.parse.urlparse('mailto:1337@example.org'),
@@ -259,10 +262,6 @@
         self.checkJoin(RFC2396_BASE, '../../', 'http://a/')
         self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g')
         self.checkJoin(RFC2396_BASE, '', RFC2396_BASE)
-        self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
-        self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
-        self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
-        self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
         self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.')
         self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g')
         self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..')
@@ -278,10 +277,17 @@
         self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x')
         self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g')
+        # self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g')
+        # self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g')
+        # self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g')
+
+
     def test_RFC3986(self):
         # Test cases from RFC3986
         self.checkJoin(RFC3986_BASE, '?y','http://a/b/c/d;p?y')
-        self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x')
+        self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x')
         self.checkJoin(RFC3986_BASE, 'g:h','g:h')
         self.checkJoin(RFC3986_BASE, 'g','http://a/b/c/g')
         self.checkJoin(RFC3986_BASE, './g','http://a/b/c/g')
@@ -305,17 +311,17 @@
         self.checkJoin(RFC3986_BASE, '../..','http://a/')
         self.checkJoin(RFC3986_BASE, '../../','http://a/')
         self.checkJoin(RFC3986_BASE, '../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '../../../g', 'http://a/g')
 
         #Abnormal Examples
 
         # The 'abnormal scenarios' are incompatible with RFC3986 parsing
         # Tests are here for reference.
 
-        #self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
-        #self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
-
+        self.checkJoin(RFC3986_BASE, '../../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '/./g','http://a/g')
+        self.checkJoin(RFC3986_BASE, '/../g','http://a/g')
         self.checkJoin(RFC3986_BASE, 'g.','http://a/b/c/g.')
         self.checkJoin(RFC3986_BASE, '.g','http://a/b/c/.g')
         self.checkJoin(RFC3986_BASE, 'g..','http://a/b/c/g..')
@@ -355,10 +361,8 @@
         self.checkJoin(SIMPLE_BASE, '../g','http://a/b/g')
         self.checkJoin(SIMPLE_BASE, '../..','http://a/')
         self.checkJoin(SIMPLE_BASE, '../../g','http://a/g')
-        self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
         self.checkJoin(SIMPLE_BASE, './../g','http://a/b/g')
         self.checkJoin(SIMPLE_BASE, './g/.','http://a/b/c/g/')
-        self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
         self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
         self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
         self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
@@ -372,6 +376,10 @@
         self.checkJoin('svn://pathtorepo/dir1', 'dir2', 'svn://pathtorepo/dir2')
         self.checkJoin('svn+ssh://pathtorepo/dir1', 'dir2', 'svn+ssh://pathtorepo/dir2')
 
+        # XXX: The following tests are no longer compatible with RFC3986
+        # self.checkJoin(SIMPLE_BASE, '../../../g','http://a/../g')
+        # self.checkJoin(SIMPLE_BASE, '/./g','http://a/./g')
+
     def test_RFC2732(self):
         str_cases = [
             ('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -409,11 +409,13 @@
         return url
     if not url:
         return base
+
     base, url, _coerce_result = _coerce_args(base, url)
     bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
             urlparse(base, '', allow_fragments)
     scheme, netloc, path, params, query, fragment = \
             urlparse(url, bscheme, allow_fragments)
+
     if scheme != bscheme or scheme not in uses_relative:
         return _coerce_result(url)
     if scheme in uses_netloc:
@@ -421,9 +423,7 @@
             return _coerce_result(urlunparse((scheme, netloc, path,
                                               params, query, fragment)))
         netloc = bnetloc
-    if path[:1] == '/':
-        return _coerce_result(urlunparse((scheme, netloc, path,
-                                          params, query, fragment)))
+
     if not path and not params:
         path = bpath
         params = bparams
@@ -431,29 +431,42 @@
             query = bquery
         return _coerce_result(urlunparse((scheme, netloc, path,
                                           params, query, fragment)))
-    segments = bpath.split('/')[:-1] + path.split('/')
-    # XXX The stuff below is bogus in various ways...
-    if segments[-1] == '.':
-        segments[-1] = ''
-    while '.' in segments:
-        segments.remove('.')
-    while 1:
-        i = 1
-        n = len(segments) - 1
-        while i < n:
-            if (segments[i] == '..'
-                and segments[i-1] not in ('', '..')):
-                del segments[i-1:i+1]
-                break
-            i = i+1
+
+    base_parts = bpath.split('/')
+    if base_parts[-1] != '':
+        # the last item is not a directory, so will not be taken into account
+        # in resolving the relative path
+        del base_parts[-1]
+
+    # per RFC 3986, ignore the base path entirely if the path starts at the root.
+    if path[:1] == '/':
+        segments = path.split('/')
+    else:
+        segments = base_parts + path.split('/')
+
+    resolved_path = []
+
+    for seg in segments:
+        if seg == '..':
+            try:
+                resolved_path.pop()
+            except IndexError:
+                # ignore any .. segments that would otherwise cause an IndexError
+                # when popped from resolved_path if resolving for rfc3986
+                pass
+        elif seg == '.':
+            continue
         else:
-            break
-    if segments == ['', '..']:
-        segments[-1] = ''
-    elif len(segments) >= 2 and segments[-1] == '..':
-        segments[-2:] = ['']
-    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
-                                      params, query, fragment)))
+            resolved_path.append(seg)
+
+    if segments[-1] in ('.', '..'):
+        # do some post-processing here. if the last segment was a relative dir,
+        # then we need to append the trailing '/'
+        resolved_path.append('')
+
+    return _coerce_result(urlunparse((scheme, netloc, '/'.join(
+        resolved_path), params, query, fragment)))
+
 
 def urldefrag(url):
     """Removes any existing fragment from URL.
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -124,6 +124,10 @@
 Library
 -------
 
+- Issue #22118: Switch urllib.parse to use RFC 3986 semantics for the
+  resolution of relative URLs, rather than RFCs 1808 and 2396.
+  Patch by Demian Brecht.
+
 - Issue #21549: Added the "members" parameter to TarFile.list().
 
 - Issue #19628: Allow compileall recursion depth to be specified with a -r

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list