[Python-checkins] distutils2: Only retrieve links that have rel="homepage" or rel="download", from PyPI pages.
tarek.ziade
python-checkins at python.org
Sun Jul 4 11:48:40 CEST 2010
tarek.ziade pushed 4b8eaa414258 to distutils2:
http://hg.python.org/distutils2/rev/4b8eaa414258
changeset: 316:4b8eaa414258
user: Alexis Metaireau <ametaireau at gmail.com>
date: Tue Jun 22 11:14:06 2010 +0200
summary: Only retrieve links that have rel="homepage" or rel="download", from PyPI pages.
files: src/distutils2/pypi/simple.py, src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html, src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html, src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html, src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html, src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html, src/distutils2/tests/pypiserver/with_norel_links/simple/index.html, src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html, src/distutils2/tests/test_pypi_simple.py
diff --git a/src/distutils2/pypi/simple.py b/src/distutils2/pypi/simple.py
--- a/src/distutils2/pypi/simple.py
+++ b/src/distutils2/pypi/simple.py
@@ -36,6 +36,7 @@
# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
+REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
def socket_timeout(timeout=SOCKET_TIMEOUT):
@@ -170,30 +171,60 @@
self._distributions[dist.name].append(dist)
return self._distributions[dist.name]
- def _process_url(self, url, project_name=None, follow_links=True):
+ def _process_url(self, url, project_name=None,follow_links=True,
+ filter_rel=False):
"""Process an url and search for distributions packages.
:param url: the url to analyse
:param project_name: the project name we are searching for.
- :param follow_links: We do not want to follow links more than from one
+ :param follow_links: Do not want to follow links more than from one
level. This parameter tells if we want to follow the links we find (eg.
run recursively this method on it)
"""
f = self._open_url(url)
- base = f.url
- self._processed_urls.append(url)
- for match in HREF.finditer(f.read()):
- link = urlparse.urljoin(base, self._htmldecode(match.group(1)))
- if link not in self._processed_urls:
- if self._is_distribution(link):
- # it's a distribution, so create a dist object
- self._processed_urls.append(link)
- self._register_dist(PyPIDistribution.from_url(link,
- project_name))
- else:
- if self._is_browsable(link) and follow_links:
- self._process_url(link, project_name,
- follow_links=False)
+ base_url = f.url
+ if url not in self._processed_urls:
+ self._processed_urls.append(url)
+ link_matcher = self._get_link_matcher(url)
+ for link, is_download in link_matcher(f.read(), base_url):
+ if link not in self._processed_urls:
+ if self._is_distribution(link) or is_download:
+ # it's a distribution, so create a dist object
+ self._processed_urls.append(link)
+ self._register_dist(PyPIDistribution.from_url(link,
+ project_name))
+ else:
+ if self._is_browsable(link) and follow_links:
+ self._process_url(link, project_name,
+ follow_links=False)
+
+ def _get_link_matcher(self, url):
+ """Return the link matcher function to use for the given url.
+
+ Urls containing ``self.index_url`` get ``self._simple_link_finder``;
+ every other url gets ``self._default_link_finder``.
+ """
+ if self.index_url in url:
+ return self._simple_link_finder
+ else:
+ return self._default_link_finder
+
+    def _simple_link_finder(self, content, base_url):
+        """Yield (link, is_download) for rel="download"/rel="homepage" links.
+
+        This matches the simple index requirements for matching links.
+
+        :param content: the HTML content of the page to scan
+        :param base_url: the url used to resolve relative links
+        :returns: an iterator of (absolute link, is_download) tuples, where
+                  is_download is True when the tag carries rel="download"
+        """
+        for match in REL.finditer(content):
+            tag, rel = match.groups()
+            # Use the method rather than str.strip so unicode content is
+            # handled as well as byte strings.
+            rels = [value.strip() for value in rel.lower().split(',')]
+            if 'homepage' in rels or 'download' in rels:
+                # Distinct name so the outer REL match is not shadowed.
+                for href_match in HREF.finditer(tag):
+                    link = urlparse.urljoin(
+                        base_url, self._htmldecode(href_match.group(1)))
+                    # The rel value is "download" (singular); testing for
+                    # "downloads" could never be true.
+                    yield link, 'download' in rels
+
+ def _default_link_finder(self, content, base_url):
+ """Yield a (link, False) tuple for every HREF match found in content.
+
+ Each link is HTML-decoded and joined with base_url; the second item
+ of the tuple is always False.
+ """
+ for match in HREF.finditer(content):
+ yield (urlparse.urljoin(base_url,
+ self._htmldecode(match.group(1))), False)
def _process_pypi_page(self, name):
"""Find and process a PyPI page for the given project name.
diff --git a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
--- a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
+++ b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/badmd5/index.html
@@ -1,3 +1,3 @@
<html><body>
-<a href="badmd5-0.1.tar.gz#md5=3e3d86693d6564c807272b11b3069dfe">badmd5-0.1.tar.gz</a><br/>
+<a href="badmd5-0.1.tar.gz#md5=3e3d86693d6564c807272b11b3069dfe" rel="download">badmd5-0.1.tar.gz</a><br/>
</body></html>
diff --git a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/downloads_with_md5/simple/foobar/index.html
@@ -1,3 +1,3 @@
<html><body>
-<a href="foobar-0.1.tar.gz#md5=d41d8cd98f00b204e9800998ecf8427e">foobar-0.1.tar.gz</a><br/>
+<a href="foobar-0.1.tar.gz#md5=d41d8cd98f00b204e9800998ecf8427e" rel="download">foobar-0.1.tar.gz</a><br/>
</body></html>
diff --git a/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html b/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/test_found_links/simple/foobar/index.html
@@ -1,6 +1,6 @@
<html><head><title>Links for Foobar</title></head><body><h1>Links for Foobar</h1>
-<a href="../../packages/source/F/Foobar/Foobar-1.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c174">Foobar-1.0.tar.gz</a><br/>
-<a href="../../packages/source/F/Foobar/Foobar-1.0.1.tar.gz#md5=2351efb20f6b7b5d9ce80fa4cb1bd9ca">Foobar-1.0.1.tar.gz</a><br/>
-<a href="../../packages/source/F/Foobar/Foobar-2.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c274">Foobar-2.0.tar.gz</a><br/>
-<a href="../../packages/source/F/Foobar/Foobar-2.0.1.tar.gz#md5=2352efb20f6b7b5d9ce80fa4cb2bd9ca">Foobar-2.0.1.tar.gz</a><br/>
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-1.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c174">Foobar-1.0.tar.gz</a><br/>
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-1.0.1.tar.gz#md5=2351efb20f6b7b5d9ce80fa4cb1bd9ca">Foobar-1.0.1.tar.gz</a><br/>
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-2.0.tar.gz#md5=98fa833fdabcdd78d00245aead66c274">Foobar-2.0.tar.gz</a><br/>
+<a rel="download" href="../../packages/source/F/Foobar/Foobar-2.0.1.tar.gz#md5=2352efb20f6b7b5d9ce80fa4cb2bd9ca">Foobar-2.0.1.tar.gz</a><br/>
</body></html>
diff --git a/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/with_externals/simple/foobar/index.html
@@ -1,4 +1,4 @@
<html><body>
-<a href="/foobar-0.1.tar.gz#md5=12345678901234567">foobar-0.1.tar.gz</a><br/>
+<a rel ="download" href="/foobar-0.1.tar.gz#md5=12345678901234567">foobar-0.1.tar.gz</a><br/>
<a href="../../external/external.html" rel="homepage">external homepage</a><br/>
</body></html>
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html b/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/external/homepage.html
@@ -0,0 +1,7 @@
+<html>
+<body>
+<p>a rel=homepage HTML page</p>
+<a href="/foobar-2.0.tar.gz">foobar 2.0</a>
+</body>
+</html>
+
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html b/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/external/nonrel.html
@@ -0,0 +1,1 @@
+A page linked without rel="download" or rel="homepage" link.
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/simple/foobar/index.html
@@ -0,0 +1,6 @@
+<html><body>
+<a rel="download" href="/foobar-0.1.tar.gz">foobar-0.1.tar.gz</a><br/>
+<a href="../../external/homepage.html" rel="homepage">external homepage</a><br/>
+<a href="../../external/nonrel.html">unrelated link</a><br/>
+<a href="/unrelated-0.2.tar.gz">unrelated download</a><br/>
+</body></html>
diff --git a/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html b/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html
new file mode 100644
--- /dev/null
+++ b/src/distutils2/tests/pypiserver/with_norel_links/simple/index.html
@@ -0,0 +1,1 @@
+<a href="foobar/">foobar/</a>
diff --git a/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html b/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
--- a/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
+++ b/src/distutils2/tests/pypiserver/with_real_externals/simple/foobar/index.html
@@ -1,4 +1,4 @@
<html><body>
-<a href="/foobar-0.1.tar.gz#md5=0_correct_md5">foobar-0.1.tar.gz</a><br/>
+<a rel="download" href="/foobar-0.1.tar.gz#md5=0_correct_md5">foobar-0.1.tar.gz</a><br/>
<a href="http://a-really-external-website/external/external.html" rel="homepage">external homepage</a><br/>
</body></html>
diff --git a/src/distutils2/tests/test_pypi_simple.py b/src/distutils2/tests/test_pypi_simple.py
--- a/src/distutils2/tests/test_pypi_simple.py
+++ b/src/distutils2/tests/test_pypi_simple.py
@@ -22,7 +22,7 @@
urls
"""
if hosts is None:
- hosts = (server.full_address,)
+ hosts = (server.full_address.strip("http://"),)
kwargs['hosts'] = hosts
return simple.SimpleIndex(server.full_address + base_url, *args,
**kwargs)
@@ -114,15 +114,15 @@
# We query only for the version 1.1, so all distributions must be
# filled in the package_index (as the url has been scanned), but
# "get" must only return the one we want.
- pi = self._get_simple_index(server)
- last_distribution = pi.get("foobar")
+ index = self._get_simple_index(server)
+ last_distribution = index.get("foobar")
# we have scanned the index page
self.assertIn(server.full_address + "/simple/foobar/",
- pi._processed_urls)
+ index._processed_urls)
# we have found 4 distributions in this page
- self.assertEqual(len(pi._distributions["foobar"]), 4)
+ self.assertEqual(len(index._distributions["foobar"]), 4)
# and returned the most recent one
self.assertEqual(last_distribution.version, '2.0.1')
@@ -144,10 +144,10 @@
"""
# Try to request the package index, which contains links to "externals"
# resources. They have to be scanned too.
- pi = self._get_simple_index(server, hosts=("*",))
- pi.get("foobar")
+ index = self._get_simple_index(server, hosts=("*",))
+ index.get("foobar")
self.assertIn(server.full_address + "/external/external.html",
- pi._processed_urls)
+ index._processed_urls)
@use_pypi_server("with_real_externals")
def test_disable_external_pages(self, server):
@@ -155,10 +155,10 @@
"""
# Test that telling the simple PyPI client not to retrieve external
# pages works
- pi = self._get_simple_index(server, hosts=(server.full_address,))
- pi.get("foobar")
+ index = self._get_simple_index(server, hosts=(server.full_address,))
+ index.get("foobar")
self.assertNotIn(server.full_address + "/external/external.html",
- pi._processed_urls)
+ index._processed_urls)
@use_pypi_server("downloads_with_md5")
def download_package(self, server):
@@ -168,27 +168,28 @@
# If we request a download specific version of a distribution,
# the system must download it, check the md5 and unpack it in a
# temporary location, that must be returned by the lib.
- pi = self._get_simple_index(server)
+ index = self._get_simple_index(server)
# assert we can download a specific version
- temp_location_1 = pi.download("foobar (0.1)")
+ temp_location_1 = index.download("foobar (0.1)")
self.assertIn("foobar-0.1.tar.gz", temp_location_1)
paths.append(temp_location_1) # to delete later
# assert we take the latest
- temp_location_2 = pi.download("foobar")
+ temp_location_2 = index.download("foobar")
self.assertIn("foobar-0.1.tar.gz", temp_location_2)
paths.append(temp_location_2)
# we also can specify a temp location
specific_temp_location = tempfile.mkdtemp()
- returned_temp_location = pi.download("foobar", specific_temp_location)
+ returned_temp_location = index.download("foobar",
+ specific_temp_location)
self.assertIn(specific_temp_location, returned_temp_location)
paths.append(returned_temp_location)
# raise an error if we couldn't manage to get the file with the right
# md5 hash
- self.assertRaises(DistutilsError, pi.download, "badmd5")
+ self.assertRaises(DistutilsError, index.download, "badmd5")
# delete the temp paths
for path in paths:
@@ -209,8 +210,8 @@
Usecase :
- someone uploads a package on pypi, a md5 is generated
- - someone manually copies this link (with the md5 in the url) onto an
- external page accessible from the package page.
+ - someone manually copies this link (with the md5 in the url) onto
+ an external page accessible from the package page.
- someone reuploads the package (with a different md5)
- while easy_installing, an MD5 error occurs because the external link
is used
@@ -220,8 +221,8 @@
index_url = server.full_address + '/simple/'
# scan a test index
- pi = simple.SimpleIndex(index_url)
- dists = pi.find("foobar")
+ index = simple.SimpleIndex(index_url)
+ dists = index.find("foobar")
server.stop()
# the distribution has been found
@@ -231,6 +232,31 @@
# the link should be from the index
self.assertEqual('12345678901234567', dists[0].md5_hash)
+ @use_pypi_server(static_filesystem_paths=["with_norel_links"],
+ static_uri_paths=["simple", "external"])
+ def test_not_scan_all_links(self, server):
+ """Do not follow all index page links.
+ Links on index pages that are not tagged with rel="download" or
+ rel="homepage" must not be processed by the package index.
+ """
+ # process the pages
+ index = self._get_simple_index(server)
+ index.find("foobar")
+ # now it should have processed only pages with links rel="download"
+ # and rel="homepage"
+ self.assertIn("%s/simple/foobar/" % server.full_address,
+ index._processed_urls) # it's the simple index page
+ self.assertIn("%s/external/homepage.html" % server.full_address,
+ index._processed_urls) # the external homepage is rel="homepage"
+ self.assertNotIn("%s/external/nonrel.html" % server.full_address,
+ index._processed_urls) # this link contains no rel=*
+ self.assertNotIn("%s/unrelated-0.2.tar.gz" % server.full_address,
+ index._processed_urls) # linked from simple index (no rel)
+ self.assertIn("%s/foobar-0.1.tar.gz" % server.full_address,
+ index._processed_urls) # linked from simple index (rel)
+ self.assertIn("%s/foobar-2.0.tar.gz" % server.full_address,
+ index._processed_urls) # found on the rel="homepage" external page
+
def test_suite():
return unittest2.makeSuite(PyPISimpleTestCase)
--
Repository URL: http://hg.python.org/distutils2
More information about the Python-checkins
mailing list