[Patches] updates to robotparser.py

Skip Montanaro skip@mojam.com (Skip Montanaro)
Fri, 24 Mar 2000 16:12:59 -0600


The robotparser.py module currently lives in Tools/webchecker.  In
preparation for its migration to Lib, I made the following changes:

    * renamed the test() function to _test()
    * corrected the URLs in _test() so they refer to actual documents
    * added an "if __name__ == '__main__'" catcher to invoke _test()
      when run as a main program
    * added doc strings for the two main methods, parse and can_fetch
    * replaced usage of regsub and regex with corresponding re code

	Disclaimer:

	I confirm that, to the best of my knowledge and belief, this
	contribution is free of any claims of third parties under
	copyright, patent or other rights or interests ("claims").  To
	the extent that I have any such claims, I hereby grant to CNRI a
	nonexclusive, irrevocable, royalty-free, worldwide license to
	reproduce, distribute, perform and/or display publicly, prepare
	derivative versions, and otherwise use this contribution as part
	of the Python software and its related documentation, or any
	derivative versions thereof, at no cost to CNRI or its licensed
	users, and to authorize others to do so.

	I acknowledge that CNRI may, at its sole discretion, decide
	whether or not to incorporate this contribution in the Python
	software and its related documentation.  I further grant CNRI
	permission to use my name and other identifying information
	provided to CNRI by me for use in connection with the Python
	software and its related documentation.

-- 
Skip Montanaro | http://www.mojam.com/
skip@mojam.com | http://www.musi-cal.com/


*** /tmp/robotparser.py.~1.2~	Fri Mar 24 16:06:54 2000
--- /tmp/robotparser.py	Fri Mar 24 16:06:54 2000
***************
*** 23,37 ****
  
      def set_url(self, url):
          self.url = url
- ##      import urlmisc
- ##      self.url = urlmisc.canonical_url(url)
  
      def read(self):
          import urllib
          self.parse(urllib.urlopen(self.url).readlines())
  
      def parse(self, lines):
!         import regsub, string, regex
          active = []
          for line in lines:
              if self.debug: print '>', line,
--- 23,36 ----
  
      def set_url(self, url):
          self.url = url
  
      def read(self):
          import urllib
          self.parse(urllib.urlopen(self.url).readlines())
  
      def parse(self, lines):
!         """parse the input lines from a robots.txt file"""
!         import string, re
          active = []
          for line in lines:
              if self.debug: print '>', line,
***************
*** 43,49 ****
              line = string.strip(line[:string.find(line, '#')])
              if not line:
                  continue
!             line = regsub.split(line, ' *: *')
              if len(line) == 2:
                  line[0] = string.lower(line[0])
                  if line[0] == 'user-agent':
--- 42,48 ----
              line = string.strip(line[:string.find(line, '#')])
              if not line:
                  continue
!             line = re.split(' *: *', line)
              if len(line) == 2:
                  line[0] = string.lower(line[0])
                  if line[0] == 'user-agent':
***************
*** 56,62 ****
                      if line[1]:
                          if self.debug: print '>> disallow:', line[1]
                          for agent in active:
!                             self.rules[agent].append(regex.compile(line[1]))
                      else:
                          pass
                          for agent in active:
--- 55,61 ----
                      if line[1]:
                          if self.debug: print '>> disallow:', line[1]
                          for agent in active:
!                             self.rules[agent].append(re.compile(line[1]))
                      else:
                          pass
                          for agent in active:
***************
*** 68,97 ****
          self.modified()
  
      # returns true if agent is allowed to fetch url
!     def can_fetch(self, agent, url):
          import urlparse
!         ag = agent
          if not self.rules.has_key(ag): ag = '*'
          if not self.rules.has_key(ag):
!             if self.debug: print '>> allowing', url, 'fetch by', agent
              return 1
          path = urlparse.urlparse(url)[2]
          for rule in self.rules[ag]:
!             if rule.match(path) != -1:
!                 if self.debug: print '>> disallowing', url, 'fetch by', agent
                  return 0
!         if self.debug: print '>> allowing', url, 'fetch by', agent
          return 1
  
! def test():
      rp = RobotFileParser()
      rp.debug = 1
!     rp.set_url('http://www.automatrix.com/robots.txt')
      rp.read()
      print rp.rules
!     print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
      print rp.can_fetch('Musi-Cal-Robot',
!                        'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
  
!     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
!     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
--- 67,97 ----
          self.modified()
  
      # returns true if agent is allowed to fetch url
!     def can_fetch(self, useragent, url):
!         """using the parsed robots.txt decide if useragent can fetch url"""
          import urlparse
!         ag = useragent
          if not self.rules.has_key(ag): ag = '*'
          if not self.rules.has_key(ag):
!             if self.debug: print '>> allowing', url, 'fetch by', useragent
              return 1
          path = urlparse.urlparse(url)[2]
          for rule in self.rules[ag]:
!             if rule.match(path) is not None:
!                 if self.debug: print '>> disallowing', url, 'fetch by', useragent
                  return 0
!         if self.debug: print '>> allowing', url, 'fetch by', useragent
          return 1
  
! def _test():
      rp = RobotFileParser()
      rp.debug = 1
!     rp.set_url('http://www.musi-cal.com/robots.txt')
      rp.read()
      print rp.rules
!     print rp.can_fetch('*', 'http://www.musi-cal.com.com/')
      print rp.can_fetch('Musi-Cal-Robot',
!                        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
  
! if __name__ == "__main__":
!     _test()