[Python-checkins] r65255 - in python/trunk/Lib: robotparser.py test/test_robotparser.py
skip.montanaro
python-checkins at python.org
Sun Jul 27 02:49:04 CEST 2008
Author: skip.montanaro
Date: Sun Jul 27 02:49:02 2008
New Revision: 65255
Log:
Close issue 3437 - missing state change when Allow lines are processed.
Adds test cases which use Allow: as well.
Modified:
python/trunk/Lib/robotparser.py
python/trunk/Lib/test/test_robotparser.py
Modified: python/trunk/Lib/robotparser.py
==============================================================================
--- python/trunk/Lib/robotparser.py (original)
+++ python/trunk/Lib/robotparser.py Sun Jul 27 02:49:02 2008
@@ -76,6 +76,10 @@
"""parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
+ # states:
+ # 0: start state
+ # 1: saw user-agent line
+ # 2: saw an allow or disallow line
state = 0
linenumber = 0
entry = Entry()
@@ -114,6 +118,7 @@
elif line[0] == "allow":
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
+ state = 2
if state == 2:
self.entries.append(entry)
Modified: python/trunk/Lib/test/test_robotparser.py
==============================================================================
--- python/trunk/Lib/test/test_robotparser.py (original)
+++ python/trunk/Lib/test/test_robotparser.py Sun Jul 27 02:49:02 2008
@@ -134,6 +134,75 @@
RobotTest(7, doc, good, bad)
+# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
+
+# 8.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(8, doc, good, bad, agent="Googlebot")
+
+# 9. This file is incorrect: the "Googlebot" entry comes first and, being a
+# substring of "Googlebot-Mobile", matches it too, so test 10 acts like test 9.
+doc = """
+User-agent: Googlebot
+Disallow: /
+
+User-agent: Googlebot-Mobile
+Allow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(9, doc, good, bad, agent="Googlebot")
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
+
+# 11. Get the order correct.
+doc = """
+User-agent: Googlebot-Mobile
+Allow: /
+
+User-agent: Googlebot
+Disallow: /
+"""
+
+good = []
+bad = ['/something.jpg']
+
+RobotTest(11, doc, good, bad, agent="Googlebot")
+
+good = ['/something.jpg']
+bad = []
+
+RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")
+
+
+# 13. Google also got the order wrong in #8. You need to specify the
+# URLs from more specific to more general.
+doc = """
+User-agent: Googlebot
+Allow: /folder1/myfile.html
+Disallow: /folder1/
+"""
+
+good = ['/folder1/myfile.html']
+bad = ['/folder1/anotherfile.html']
+
+RobotTest(13, doc, good, bad, agent="googlebot")
+
+
+
class TestCase(unittest.TestCase):
def runTest(self):
test_support.requires('network')
More information about the Python-checkins mailing list