[Python-checkins] distutils2: Fix bad behavior while browsing the simple index.

tarek.ziade python-checkins at python.org
Sun Aug 8 11:50:46 CEST 2010


tarek.ziade pushed 45e7efdb6985 to distutils2:

http://hg.python.org/distutils2/rev/45e7efdb6985
changeset:   451:45e7efdb6985
user:        Alexis Metaireau <ametaireau at gmail.com>
date:        Wed Jul 21 11:26:42 2010 +0200
summary:     Fix bad behavior while browsing the simple index.
files:       src/distutils2/index/dist.py, src/distutils2/index/simple.py, src/distutils2/tests/test_index_simple.py

diff --git a/src/distutils2/index/dist.py b/src/distutils2/index/dist.py
--- a/src/distutils2/index/dist.py
+++ b/src/distutils2/index/dist.py
@@ -123,6 +123,9 @@
             raise TypeError("cannot compare %s and %s"
                 % (self.name, other.name))
 
+    def __repr__(self):
+        return "<%s %s>" %(self.name, self.version)
+
     def __eq__(self, other):
         self._check_is_comparable(other)
         return self.version == other.version
diff --git a/src/distutils2/index/simple.py b/src/distutils2/index/simple.py
--- a/src/distutils2/index/simple.py
+++ b/src/distutils2/index/simple.py
@@ -14,7 +14,7 @@
 
 from distutils2.index.base import IndexClient
 from distutils2.index.dist import (ReleasesList, EXTENSIONS,
-                                   get_infos_from_url)
+                                   get_infos_from_url, MD5_HASH)
 from distutils2.index.errors import (IndexError, DownloadError,
                                      UnableToDownload)
 from distutils2.index.mirrors import get_mirrors
@@ -30,9 +30,6 @@
 # -- Regexps -------------------------------------------------
 EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
 HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
-PYPI_MD5 = re.compile(
-    '<a href="([^"#]+)">([^<]+)</a>\n\s+\\(<a (?:title="MD5 hash"\n\s+)'
-    'href="[^?]+\?:action=show_md5&amp;digest=([0-9a-f]{32})">md5</a>\\)')
 URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match
 
 # This pattern matches a character entity reference (a decimal numeric
@@ -238,6 +235,9 @@
         else:
             return self._default_link_matcher
 
+    def _get_full_url(self, url, base_url):
+        return urlparse.urljoin(base_url, self._htmldecode(url))
+
     def _simple_link_matcher(self, content, base_url):
         """Yield all links with a rel="download" or rel="homepage".
 
@@ -245,23 +245,27 @@
         If follow_externals is set to False, dont yeld the external
         urls.
         """
+        for match in HREF.finditer(content):
+            url = self._get_full_url(match.group(1), base_url)
+            if MD5_HASH.match(url):
+                yield (url, True)
+
         for match in REL.finditer(content):
+            # search for rel links.
             tag, rel = match.groups()
             rels = map(str.strip, rel.lower().split(','))
             if 'homepage' in rels or 'download' in rels:
                 for match in HREF.finditer(tag):
-                    url = urlparse.urljoin(base_url,
-                                           self._htmldecode(match.group(1)))
+                    url = self._get_full_url(match.group(1), base_url)
                     if 'download' in rels or self._is_browsable(url):
                         # yield a list of (url, is_download)
-                        yield (urlparse.urljoin(base_url, url),
-                               'download' in rels)
+                        yield (url, 'download' in rels)
 
     def _default_link_matcher(self, content, base_url):
         """Yield all links found on the page.
         """
         for match in HREF.finditer(content):
-            url = urlparse.urljoin(base_url, self._htmldecode(match.group(1)))
+            url = self._get_full_url(match.group(1), base_url)
             if self._is_browsable(url):
                 yield (url, False)
 
diff --git a/src/distutils2/tests/test_index_simple.py b/src/distutils2/tests/test_index_simple.py
--- a/src/distutils2/tests/test_index_simple.py
+++ b/src/distutils2/tests/test_index_simple.py
@@ -239,24 +239,30 @@
         #      returns false for it.
         #   3. one link that must be followed cause it's a homepage that is
         #      browsable
+        #   4. one link that must be followed, because it contains an md5 hash
         self.assertTrue(crawler._is_browsable("%stest" % crawler.index_url))
         self.assertFalse(crawler._is_browsable("http://dl-link2"))
         content = """
         <a href="http://dl-link1" rel="download">download_link1</a>
         <a href="http://dl-link2" rel="homepage">homepage_link1</a>
-        <a href="%stest" rel="homepage">homepage_link2</a>
-        """ % crawler.index_url
+        <a href="%(index_url)stest" rel="homepage">homepage_link2</a>
+        <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef>download_link2</a>
+        """ % {'index_url': crawler.index_url }
 
         # Test that the simple link matcher yield the good links.
         generator = crawler._simple_link_matcher(content, crawler.index_url)
+        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' % crawler.index_url, 
+                         True), generator.next())
         self.assertEqual(('http://dl-link1', True), generator.next())
         self.assertEqual(('%stest' % crawler.index_url, False),
                          generator.next())
         self.assertRaises(StopIteration, generator.next)
 
-        # Follow the external links is possible
+        # Following external links is possible (e.g. homepages)
         crawler.follow_externals = True
         generator = crawler._simple_link_matcher(content, crawler.index_url)
+        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' % crawler.index_url, 
+                         True), generator.next())
         self.assertEqual(('http://dl-link1', True), generator.next())
         self.assertEqual(('http://dl-link2', False), generator.next())
         self.assertEqual(('%stest' % crawler.index_url, False),

--
Repository URL: http://hg.python.org/distutils2


More information about the Python-checkins mailing list