[Python-3000-checkins] r65118 - in python/branches/py3k/Lib: test/test_robotparser.py urllib/robotparser.py

jeremy.hylton python-3000-checkins at python.org
Fri Jul 18 22:59:45 CEST 2008


Author: jeremy.hylton
Date: Fri Jul 18 22:59:44 2008
New Revision: 65118

Log:
Bug 3347: robotparser failed because it didn't convert bytes to string.

The solution is to decode the bytes to text via UTF-8.  I'm not entirely
sure this is safe in all cases, but robots.txt files are expected to be
ASCII, and UTF-8 is a superset of ASCII.
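
For the curious, the decode-then-parse step boils down to something like
the sketch below (the raw payload and URLs are made up for illustration,
not taken from the test suite):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    # Bytes as they would come back from urlopen().read()
    raw = b"User-agent: *\nDisallow: /private/\n"
    # Decode to text before handing the lines to parse(); robots.txt is
    # expected to be ASCII, and UTF-8 is a superset of ASCII.
    parser.parse(raw.decode("utf-8").splitlines())
    print(parser.can_fetch("*", "http://example.com/private/page.html"))  # False
    print(parser.can_fetch("*", "http://example.com/public/page.html"))   # True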



Modified:
   python/branches/py3k/Lib/test/test_robotparser.py
   python/branches/py3k/Lib/urllib/robotparser.py

Modified: python/branches/py3k/Lib/test/test_robotparser.py
==============================================================================
--- python/branches/py3k/Lib/test/test_robotparser.py	(original)
+++ python/branches/py3k/Lib/test/test_robotparser.py	Fri Jul 18 22:59:44 2008
@@ -136,8 +136,9 @@
 
 RobotTest(7, doc, good, bad)
 
-class TestCase(unittest.TestCase):
-    def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+    def testPasswordProtectedSite(self):
         support.requires('network')
         # whole site is password-protected.
         url = 'http://mueblesmoraleda.com'
@@ -146,9 +147,17 @@
         parser.read()
         self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
 
+    def testPythonOrg(self):
+        support.requires('network')
+        parser = urllib.robotparser.RobotFileParser(
+            "http://www.python.org/robots.txt")
+        parser.read()
+        self.assertTrue(parser.can_fetch("*",
+                                         "http://www.python.org/robots.txt"))
+
 def test_main():
+    support.run_unittest(NetworkTestCase)
     support.run_unittest(tests)
-    TestCase().run()
 
 if __name__=='__main__':
     support.Verbose = 1

Modified: python/branches/py3k/Lib/urllib/robotparser.py
==============================================================================
--- python/branches/py3k/Lib/urllib/robotparser.py	(original)
+++ python/branches/py3k/Lib/urllib/robotparser.py	Fri Jul 18 22:59:44 2008
@@ -60,7 +60,8 @@
             elif err.code >= 400:
                 self.allow_all = True
         else:
-            self.parse(f.read().splitlines())
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:
@@ -123,7 +124,10 @@
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(
+            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+        if not url:
+            url = "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
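
The can_fetch() change above just unrolls the old one-liner into an
explicit empty-path check; roughly, the normalization does the following
(the helper name and sample URLs are made up for illustration):

    import urllib.parse

    def _normalized_path(url):
        # Keep only the (re-quoted) path component of the URL; an empty
        # path (e.g. "http://example.com") falls back to "/".
        path = urllib.parse.quote(
            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
        return path if path else "/"

    print(_normalized_path("http://example.com"))               # "/"
    print(_normalized_path("http://example.com/a%20b/c.html"))  # "/a%20b/c.html"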

