[Python-checkins] distutils2: Only retrieve links that have rel="homepage" or rel="download" from PyPI pages.

tarek.ziade python-checkins at python.org
Sun Jul 4 11:48:40 CEST 2010


tarek.ziade pushed 4b8eaa414258 to distutils2:

http://hg.python.org/distutils2/rev/4b8eaa414258
changeset:   316:4b8eaa414258
user:        Alexis Metaireau <ametaireau at gmail.com>
date:        Tue Jun 22 11:14:06 2010 +0200
summary:     Only retrieve links that have rel="homepage" or rel="download" from PyPI pages.
files:       src/distutils2/pypi/simple.py, src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html, src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html, src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html, src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html, src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_norel_links/simple/index.html, src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html, src/distutils2/tests/test_pypi_simple.py

diff --git a/src/distutils2/pypi/simple.py b/src/distutils2/pypi/simple.py
--- a/src/distutils2/pypi/simple.py
+++ b/src/distutils2/pypi/simple.py
@@ -36,6 +36,7 @@
 # This pattern matches a character entity reference (a decimal numeric
 # references, a hexadecimal numeric reference, or a named reference).
 ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
+REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
 
 
 def socket_timeout(timeout=SOCKET_TIMEOUT):
@@ -170,30 +171,60 @@
         self._distributions[dist.name].append(dist)
         return self._distributions[dist.name]
 
-    def _process_url(self, url, project_name=None, follow_links=True):
+    def _process_url(self, url, project_name=None,follow_links=True, 
+        filter_rel=False):
         """Process an url and search for distributions packages.
 
         :param url: the url to analyse
         :param project_name: the project name we are searching for.
-        :param follow_links: We do not want to follow links more than from one
+        :param follow_links: We do not want to follow links more than one
         level. This parameter tells if we want to follow the links we find (eg.
         run recursively this method on it)
         """
         f = self._open_url(url)
-        base = f.url
-        self._processed_urls.append(url)
-        for match in HREF.finditer(f.read()):
-            link = urlparse.urljoin(base, self._htmldecode(match.group(1)))
-            if link not in self._processed_urls:
-                if self._is_distribution(link):
-                    # it's a distribution, so create a dist object
-                    self._processed_urls.append(link)
-                    self._register_dist(PyPIDistribution.from_url(link,
-                        project_name))
-                else:
-                    if self._is_browsable(link) and follow_links:
-                        self._process_url(link, project_name,
-                            follow_links=False)
+        base_url = f.url
+        if url not in self._processed_urls:
+            self._processed_urls.append(url)
+            link_matcher = self._get_link_matcher(url)
+            for link, is_download in link_matcher(f.read(), base_url):
+                if link not in self._processed_urls:
+                    if self._is_distribution(link) or is_download:
+                        # it's a distribution, so create a dist object
+                        self._processed_urls.append(link)
+                        self._register_dist(PyPIDistribution.from_url(link,
+                            project_name))
+                    else:
+                        if self._is_browsable(link) and follow_links:
+                            self._process_url(link, project_name,
+                                follow_links=False)
+    
+    def _get_link_matcher(self, url):
+        """Returns the right link matcher function of the given url
+        """
+        if self.index_url in url:
+            return self._simple_link_finder
+        else:
+            return self._default_link_finder
+
+    def _simple_link_finder(self, content, base_url):
+        """Yield all links with a rel="download" or rel="homepage".
+
+        This matches the simple index requirements for matching links
+        """
+        for match in REL.finditer(content):
+            tag, rel = match.groups()
+            rels = map(str.strip, rel.lower().split(','))
+            if 'homepage' in rels or 'download' in rels:
+                for match in HREF.finditer(tag):
+                    yield (urlparse.urljoin(base_url, 
+                        self._htmldecode(match.group(1))), "downloads" in rels)
+
+    def _default_link_finder(self, content, base_url):
+        """Yield all links found on the page.
+        """
+        for match in HREF.finditer(content):
+            yield (urlparse.urljoin(base_url, 
+                self._htmldecode(match.group(1))), False)
 
     def _process_pypi_page(self, name):
         """Find and process a PyPI page for the given project name.
diff --git a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
--- a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
+++ b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
@@ -1,3 +1,3 @@
 <html><body>
-<a href="badmd5-0.1.tar.gz#md5=3e3d86693d6564c807272b11b3069dfe">badmd5-0.1.tar.gz</a><br/>
+<a href="badmd5-0.1.tar.gz#md5=3e3d86693d6564c807272b11b3069dfe" rel="download">badmd5-0.1.tar.gz</a><br/>
 </body></html>
diff --git a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
@@ -1,3 +1,3 @@
 <html><body>
-<a href="foobar-0.1.tar.gz#md5=d41d8cd98f00b204e9800998ecf8427e">foobar-0.1.tar.gz</a><br/>
+<a href="foobar-0.1.tar.gz#md5=d41d8cd98f00b204e9800998ecf8427e" rel="download">foobar-0.1.tar.gz</a><br/>
 </body></html>
diff --git a/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html b/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
@@ -1,6 +1,6 @@
 <html><head><title>Links for Foobar</title></head><body><h1>Links for Foobar</h1>
-<a href="../../packages/source/F/Foobar/Foobar-1.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c174">Foobar-1.0.tar.gz</a><br/> 
-<a href="../../packages/source/F/Foobar/Foobar-1.0.1.tar.gz#md5=2351efb20f6b7b5d9ce80fa4cb1bd9ca">Foobar-1.0.1.tar.gz</a><br/> 
-<a href="../../packages/source/F/Foobar/Foobar-2.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c274">Foobar-2.0.tar.gz</a><br/> 
-<a href="../../packages/source/F/Foobar/Foobar-2.0.1.tar.gz#md5=2352efb20f6b7b5d9ce80fa4cb2bd9ca">Foobar-2.0.1.tar.gz</a><br/> 
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-1.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c174">Foobar-1.0.tar.gz</a><br/> 
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-1.0.1.tar.gz#md5=2351efb20f6b7b5d9ce80fa4cb1bd9ca">Foobar-1.0.1.tar.gz</a><br/> 
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-2.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c274">Foobar-2.0.tar.gz</a><br/> 
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-2.0.1.tar.gz#md5=2352efb20f6b7b5d9ce80fa4cb2bd9ca">Foobar-2.0.1.tar.gz</a><br/> 
 </body></html>
diff --git a/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
@@ -1,4 +1,4 @@
 <html><body>
-<a href="/foobar-0.1.tar.gz#md5=12345678901234567">foobar-0.1.tar.gz</a><br/>
+<a rel ="download" href="/foobar-0.1.tar.gz#md5=12345678901234567">foobar-0.1.tar.gz</a><br/>
 <a href="../../external/external.html" rel="homepage">external homepage</a><br/>
 </body></html>
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html b/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html
@@ -0,0 +1,7 @@
+<html>
+<body>
+<p>a rel=homepage HTML page</p>
+<a href="/foobar-2.0.tar.gz">foobar 2.0</a>
+</body>
+</html>
+
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html b/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html
@@ -0,0 +1,1 @@
+A page linked without rel="download" or rel="homepage" link.
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html
@@ -0,0 +1,6 @@
+<html><body>
+<a rel="download" href="/foobar-0.1.tar.gz" rel="download">foobar-0.1.tar.gz</a><br/>
+<a href="../../external/homepage.html" rel="homepage">external homepage</a><br/>
+<a href="../../external/nonrel.html">unrelated link</a><br/>
+<a href="/unrelated-0.2.tar.gz">unrelated download</a></br/>
+</body></html>
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html b/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html
@@ -0,0 +1,1 @@
+<a href="foobar/">foobar/</a> 
diff --git a/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
@@ -1,4 +1,4 @@
 <html><body>
-<a href="/foobar-0.1.tar.gz#md5=0_correct_md5">foobar-0.1.tar.gz</a><br/>
+<a rel="download" href="/foobar-0.1.tar.gz#md5=0_correct_md5">foobar-0.1.tar.gz</a><br/>
 <a href="http://a-really-external-website/external/external.html" rel="homepage">external homepage</a><br/>
 </body></html>
diff --git a/src/distutils2/tests/test_pypi_simple.py b/src/distutils2/tests/test_pypi_simple.py
--- a/src/distutils2/tests/test_pypi_simple.py
+++ b/src/distutils2/tests/test_pypi_simple.py
@@ -22,7 +22,7 @@
         urls
         """
         if hosts is None:
-            hosts = (server.full_address,)
+            hosts = (server.full_address.strip("http://"),)
         kwargs['hosts'] = hosts
         return simple.SimpleIndex(server.full_address + base_url, *args,
             **kwargs)
@@ -114,15 +114,15 @@
         # We query only for the version 1.1, so all distributions must be
         # filled in the package_index (as the url has been scanned), but
         # "get" must only return the one we want.
-        pi = self._get_simple_index(server)
-        last_distribution = pi.get("foobar")
+        index = self._get_simple_index(server)
+        last_distribution = index.get("foobar")
 
         # we have scanned the index page
         self.assertIn(server.full_address + "/simple/foobar/",
-            pi._processed_urls)
+            index._processed_urls)
 
         # we have found 4 distributions in this page
-        self.assertEqual(len(pi._distributions["foobar"]), 4)
+        self.assertEqual(len(index._distributions["foobar"]), 4)
 
         # and returned the most recent one
         self.assertEqual(last_distribution.version, '2.0.1')
@@ -144,10 +144,10 @@
         """
         # Try to request the package index, wich contains links to "externals"
         # resources. They have to  be scanned too.
-        pi = self._get_simple_index(server, hosts=("*",))
-        pi.get("foobar")
+        index = self._get_simple_index(server, hosts=("*",))
+        index.get("foobar")
         self.assertIn(server.full_address + "/external/external.html",
-            pi._processed_urls)
+            index._processed_urls)
 
     @use_pypi_server("with_real_externals")
     def test_disable_external_pages(self, server):
@@ -155,10 +155,10 @@
         """
         # Test that telling the simple pyPI client to not retreive external
         # works
-        pi = self._get_simple_index(server, hosts=(server.full_address,))
-        pi.get("foobar")
+        index = self._get_simple_index(server, hosts=(server.full_address,))
+        index.get("foobar")
         self.assertNotIn(server.full_address + "/external/external.html",
-            pi._processed_urls)
+            index._processed_urls)
 
     @use_pypi_server("downloads_with_md5")
     def download_package(self, server):
@@ -168,27 +168,28 @@
         # If we request a download specific version of a distribution,
         # the system must download it, check the md5 and unpack it in a
         # temporary location, that must be returned by the lib.
-        pi = self._get_simple_index(server)
+        index = self._get_simple_index(server)
 
         # assert we can download a specific version
-        temp_location_1 = pi.download("foobar (0.1)")
+        temp_location_1 = index.download("foobar (0.1)")
         self.assertIn("foobar-0.1.tar.gz", temp_location_1)
         paths.append(temp_location_1)  # to delete later
 
         # assert we take the latest
-        temp_location_2 = pi.download("foobar")
+        temp_location_2 = index.download("foobar")
         self.assertIn("foobar-0.1.tar.gz", temp_location_2)
         paths.append(temp_location_2)
 
         # we also can specify a temp location
         specific_temp_location = tempfile.mkdtemp()
-        returned_temp_location = pi.download("foobar", specific_temp_location)
+        returned_temp_location = index.download("foobar",
+            specific_temp_location)
         self.assertIn(specific_temp_location, returned_temp_location)
         paths.append(returned_temp_location)
 
         # raise an error if we couldnt manage to get the file with a the good
         # md5 hash
-        self.assertRaises(DistutilsError, pi.download, "badmd5")
+        self.assertRaises(DistutilsError, index.download, "badmd5")
 
         # delete the temp paths
         for path in paths:
@@ -209,8 +210,8 @@
 
         Usecase :
         - someone uploads a package on pypi, a md5 is generated
-        - someone manually copies this link (with the md5 in the url) onto an
-          external page accessible from the package page.
+        - someone manually copies this link (with the md5 in the url) onto
+          an external page accessible from the package page.
         - someone reuploads the package (with a different md5)
         - while easy_installing, an MD5 error occurs because the external link
           is used
@@ -220,8 +221,8 @@
         index_url = server.full_address + '/simple/'
 
         # scan a test index
-        pi = simple.SimpleIndex(index_url)
-        dists = pi.find("foobar")
+        index = simple.SimpleIndex(index_url)
+        dists = index.find("foobar")
         server.stop()
 
         # the distribution has been found
@@ -231,6 +232,31 @@
         # the link should be from the index
         self.assertEqual('12345678901234567', dists[0].md5_hash)
 
+    @use_pypi_server(static_filesystem_paths=["with_norel_links"],
+        static_uri_paths=["simple", "external"])
+    def test_not_scan_all_links(self, server):
+        """Do not follow all index page links.
+        Links not tagged with rel="download" or rel="homepage" must not
+        be processed by the package index while it processes pages.
+        """
+        # process the pages
+        index = self._get_simple_index(server)
+        index.find("foobar")
+        # now it should have processed only pages with links rel="download" 
+        # and rel="homepage"
+        self.assertIn("%s/simple/foobar/" % server.full_address, 
+            index._processed_urls)  # it's the simple index page
+        self.assertIn("%s/external/homepage.html" % server.full_address, 
+            index._processed_urls)  # the external homepage is rel="homepage"
+        self.assertNotIn("%s/external/nonrel.html" % server.full_address, 
+            index._processed_urls)  # this link contains no rel=*
+        self.assertNotIn("%s/unrelated-0.2.tar.gz" % server.full_address,
+            index._processed_urls)  # linked from simple index (no rel)
+        self.assertIn("%s/foobar-0.1.tar.gz" % server.full_address,
+            index._processed_urls)  # linked from simple index (rel)
+        self.assertIn("%s/foobar-2.0.tar.gz" % server.full_address,
+            index._processed_urls)  # linked from external homepage (rel)
+
 
 def test_suite():
     return unittest2.makeSuite(PyPISimpleTestCase)

--
Repository URL: http://hg.python.org/distutils2


More information about the Python-checkins mailing list