[Patches] updates to robotparser.py
Skip Montanaro
skip@mojam.com (Skip Montanaro)
Fri, 24 Mar 2000 16:12:59 -0600
The robotparser.py module currently lives in Tools/webchecker. In
preparation for its migration to Lib, I made the following changes:
* renamed the test() function to _test()
* corrected the URLs in _test() so they refer to actual documents
* added an "if __name__ == '__main__'" guard so that _test() is invoked
when the module is run as a main program
* added docstrings for the two main methods, parse() and can_fetch()
* replaced uses of the regsub and regex modules with the equivalent re code
Disclaimer:
I confirm that, to the best of my knowledge and belief, this
contribution is free of any claims of third parties under
copyright, patent or other rights or interests ("claims"). To
the extent that I have any such claims, I hereby grant to CNRI a
nonexclusive, irrevocable, royalty-free, worldwide license to
reproduce, distribute, perform and/or display publicly, prepare
derivative versions, and otherwise use this contribution as part
of the Python software and its related documentation, or any
derivative versions thereof, at no cost to CNRI or its licensed
users, and to authorize others to do so.
I acknowledge that CNRI may, at its sole discretion, decide
whether or not to incorporate this contribution in the Python
software and its related documentation. I further grant CNRI
permission to use my name and other identifying information
provided to CNRI by me for use in connection with the Python
software and its related documentation.
--
Skip Montanaro | http://www.mojam.com/
skip@mojam.com | http://www.musi-cal.com/
*** /tmp/robotparser.py.~1.2~ Fri Mar 24 16:06:54 2000
--- /tmp/robotparser.py Fri Mar 24 16:06:54 2000
***************
*** 23,37 ****
def set_url(self, url):
self.url = url
- ## import urlmisc
- ## self.url = urlmisc.canonical_url(url)
def read(self):
import urllib
self.parse(urllib.urlopen(self.url).readlines())
def parse(self, lines):
! import regsub, string, regex
active = []
for line in lines:
if self.debug: print '>', line,
--- 23,36 ----
def set_url(self, url):
self.url = url
def read(self):
import urllib
self.parse(urllib.urlopen(self.url).readlines())
def parse(self, lines):
! """parse the input lines from a robot.txt file"""
! import string, re
active = []
for line in lines:
if self.debug: print '>', line,
***************
*** 43,49 ****
line = string.strip(line[:string.find(line, '#')])
if not line:
continue
! line = regsub.split(line, ' *: *')
if len(line) == 2:
line[0] = string.lower(line[0])
if line[0] == 'user-agent':
--- 42,48 ----
line = string.strip(line[:string.find(line, '#')])
if not line:
continue
! line = re.split(' *: *', line)
if len(line) == 2:
line[0] = string.lower(line[0])
if line[0] == 'user-agent':
***************
*** 56,62 ****
if line[1]:
if self.debug: print '>> disallow:', line[1]
for agent in active:
! self.rules[agent].append(regex.compile(line[1]))
else:
pass
for agent in active:
--- 55,61 ----
if line[1]:
if self.debug: print '>> disallow:', line[1]
for agent in active:
! self.rules[agent].append(re.compile(line[1]))
else:
pass
for agent in active:
***************
*** 68,97 ****
self.modified()
# returns true if agent is allowed to fetch url
! def can_fetch(self, agent, url):
import urlparse
! ag = agent
if not self.rules.has_key(ag): ag = '*'
if not self.rules.has_key(ag):
! if self.debug: print '>> allowing', url, 'fetch by', agent
return 1
path = urlparse.urlparse(url)[2]
for rule in self.rules[ag]:
! if rule.match(path) != -1:
! if self.debug: print '>> disallowing', url, 'fetch by', agent
return 0
! if self.debug: print '>> allowing', url, 'fetch by', agent
return 1
! def test():
rp = RobotFileParser()
rp.debug = 1
! rp.set_url('http://www.automatrix.com/robots.txt')
rp.read()
print rp.rules
! print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
print rp.can_fetch('Musi-Cal-Robot',
! 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
! print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
! print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
--- 67,97 ----
self.modified()
# returns true if agent is allowed to fetch url
! def can_fetch(self, useragent, url):
! """using the parsed robots.txt decide if useragent can fetch url"""
import urlparse
! ag = useragent
if not self.rules.has_key(ag): ag = '*'
if not self.rules.has_key(ag):
! if self.debug: print '>> allowing', url, 'fetch by', useragent
return 1
path = urlparse.urlparse(url)[2]
for rule in self.rules[ag]:
! if rule.match(path) is not None:
! if self.debug: print '>> disallowing', url, 'fetch by', useragent
return 0
! if self.debug: print '>> allowing', url, 'fetch by', useragent
return 1
! def _test():
rp = RobotFileParser()
rp.debug = 1
! rp.set_url('http://www.musi-cal.com/robots.txt')
rp.read()
print rp.rules
! print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
print rp.can_fetch('Musi-Cal-Robot',
! 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
! if __name__ == "__main__":
! _test()