[Python-checkins] CVS: python/dist/src/Tools/webchecker robotparser.py,1.2,1.3
Guido van Rossum
python-dev@python.org
Mon, 27 Mar 2000 14:29:34 -0500 (EST)
Update of /projects/cvsroot/python/dist/src/Tools/webchecker
In directory eric:/projects/python/develop/guido/src/Tools/webchecker
Modified Files:
robotparser.py
Log Message:
Skip Montanaro:
The robotparser.py module currently lives in Tools/webchecker. In
preparation for its migration to Lib, I made the following changes:
* renamed the test() function to _test()
* corrected the URLs in _test() so they refer to actual documents
* added an "if __name__ == '__main__'" catcher to invoke _test()
when run as a main program
* added doc strings for the two main methods, parse and can_fetch
* replaced usage of regsub and regex with corresponding re code
Index: robotparser.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Tools/webchecker/robotparser.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** robotparser.py 1998/04/06 14:29:18 1.2
--- robotparser.py 2000/03/27 19:29:31 1.3
***************
*** 24,29 ****
def set_url(self, url):
self.url = url
- ## import urlmisc
- ## self.url = urlmisc.canonical_url(url)
def read(self):
--- 24,27 ----
***************
*** 32,36 ****
def parse(self, lines):
! import regsub, string, regex
active = []
for line in lines:
--- 30,35 ----
def parse(self, lines):
! """parse the input lines from a robot.txt file"""
! import string, re
active = []
for line in lines:
***************
*** 44,48 ****
if not line:
continue
! line = regsub.split(line, ' *: *')
if len(line) == 2:
line[0] = string.lower(line[0])
--- 43,47 ----
if not line:
continue
! line = re.split(' *: *', line)
if len(line) == 2:
line[0] = string.lower(line[0])
***************
*** 57,61 ****
if self.debug: print '>> disallow:', line[1]
for agent in active:
! self.rules[agent].append(regex.compile(line[1]))
else:
pass
--- 56,60 ----
if self.debug: print '>> disallow:', line[1]
for agent in active:
! self.rules[agent].append(re.compile(line[1]))
else:
pass
***************
*** 69,97 ****
# returns true if agent is allowed to fetch url
! def can_fetch(self, agent, url):
import urlparse
! ag = agent
if not self.rules.has_key(ag): ag = '*'
if not self.rules.has_key(ag):
! if self.debug: print '>> allowing', url, 'fetch by', agent
return 1
path = urlparse.urlparse(url)[2]
for rule in self.rules[ag]:
! if rule.match(path) != -1:
! if self.debug: print '>> disallowing', url, 'fetch by', agent
return 0
! if self.debug: print '>> allowing', url, 'fetch by', agent
return 1
! def test():
rp = RobotFileParser()
rp.debug = 1
! rp.set_url('http://www.automatrix.com/robots.txt')
rp.read()
print rp.rules
! print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
print rp.can_fetch('Musi-Cal-Robot',
! 'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
! print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
! print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
--- 68,97 ----
# returns true if agent is allowed to fetch url
! def can_fetch(self, useragent, url):
! """using the parsed robots.txt decide if useragent can fetch url"""
import urlparse
! ag = useragent
if not self.rules.has_key(ag): ag = '*'
if not self.rules.has_key(ag):
! if self.debug: print '>> allowing', url, 'fetch by', useragent
return 1
path = urlparse.urlparse(url)[2]
for rule in self.rules[ag]:
! if rule.match(path) is not None:
! if self.debug: print '>> disallowing', url, 'fetch by', useragent
return 0
! if self.debug: print '>> allowing', url, 'fetch by', useragent
return 1
! def _test():
rp = RobotFileParser()
rp.debug = 1
! rp.set_url('http://www.musi-cal.com/robots.txt')
rp.read()
print rp.rules
! print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
print rp.can_fetch('Musi-Cal-Robot',
! 'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
! if __name__ == "__main__":
! _test()