[Python-checkins] CVS: python/dist/src/Tools/webchecker robotparser.py,1.2,1.3

Guido van Rossum <python-dev@python.org>
Mon, 27 Mar 2000 14:29:34 -0500 (EST)


Update of /projects/cvsroot/python/dist/src/Tools/webchecker
In directory eric:/projects/python/develop/guido/src/Tools/webchecker

Modified Files:
	robotparser.py 
Log Message:
Skip Montanaro:

The robotparser.py module currently lives in Tools/webchecker.  In
preparation for its migration to Lib, I made the following changes:

    * renamed the test() function to _test()
    * corrected the URLs in _test() so they refer to actual documents
    * added an "if __name__ == '__main__'" catcher to invoke _test()
      when run as a main program
    * added doc strings for the two main methods, parse and can_fetch
    * replaced usage of regsub and regex with corresponding re code
      (see the sketch after this list)
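
For reference, a minimal sketch of the old-to-new mapping the last bullet
describes; the sample strings here are hypothetical:

    import re

    # regsub.split(line, ' *: *') becomes re.split with the pattern moved
    # to the first argument:
    line = 'User-agent :  Musi-Cal-Robot'
    print(re.split(' *: *', line))      # ['User-agent', 'Musi-Cal-Robot']

    # regex.compile(pat).match(s) returned the number of characters
    # matched, or -1 on failure; re.compile(pat).match(s) returns a match
    # object, or None -- hence the "!= -1" test becoming "is not None"
    # in can_fetch below:
    rule = re.compile('/cgi-bin/')
    print(rule.match('/cgi-bin/event-search') is not None)  # True
    print(rule.match('/concerts/') is not None)             # False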


Index: robotparser.py
===================================================================
RCS file: /projects/cvsroot/python/dist/src/Tools/webchecker/robotparser.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -r1.2 -r1.3
*** robotparser.py	1998/04/06 14:29:18	1.2
--- robotparser.py	2000/03/27 19:29:31	1.3
***************
*** 24,29 ****
      def set_url(self, url):
          self.url = url
- ##      import urlmisc
- ##      self.url = urlmisc.canonical_url(url)
  
      def read(self):
--- 24,27 ----
***************
*** 32,36 ****
  
      def parse(self, lines):
!         import regsub, string, regex
          active = []
          for line in lines:
--- 30,35 ----
  
      def parse(self, lines):
!         """parse the input lines from a robot.txt file"""
!         import string, re
          active = []
          for line in lines:
***************
*** 44,48 ****
              if not line:
                  continue
!             line = regsub.split(line, ' *: *')
              if len(line) == 2:
                  line[0] = string.lower(line[0])
--- 43,47 ----
              if not line:
                  continue
!             line = re.split(' *: *', line)
              if len(line) == 2:
                  line[0] = string.lower(line[0])
***************
*** 57,61 ****
                          if self.debug: print '>> disallow:', line[1]
                          for agent in active:
!                             self.rules[agent].append(regex.compile(line[1]))
                      else:
                          pass
--- 56,60 ----
                          if self.debug: print '>> disallow:', line[1]
                          for agent in active:
!                             self.rules[agent].append(re.compile(line[1]))
                      else:
                          pass
***************
*** 69,97 ****
  
      # returns true if agent is allowed to fetch url
!     def can_fetch(self, agent, url):
          import urlparse
!         ag = agent
          if not self.rules.has_key(ag): ag = '*'
          if not self.rules.has_key(ag):
!             if self.debug: print '>> allowing', url, 'fetch by', agent
              return 1
          path = urlparse.urlparse(url)[2]
          for rule in self.rules[ag]:
!             if rule.match(path) != -1:
!                 if self.debug: print '>> disallowing', url, 'fetch by', agent
                  return 0
!         if self.debug: print '>> allowing', url, 'fetch by', agent
          return 1
  
! def test():
      rp = RobotFileParser()
      rp.debug = 1
!     rp.set_url('http://www.automatrix.com/robots.txt')
      rp.read()
      print rp.rules
!     print rp.can_fetch('*', 'http://www.calendar.com/concerts/')
      print rp.can_fetch('Musi-Cal-Robot',
!                        'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones')
  
!     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/')
!     print rp.can_fetch('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001')
--- 68,97 ----
  
      # returns true if agent is allowed to fetch url
!     def can_fetch(self, useragent, url):
!         """using the parsed robots.txt decide if useragent can fetch url"""
          import urlparse
!         ag = useragent
          if not self.rules.has_key(ag): ag = '*'
          if not self.rules.has_key(ag):
!             if self.debug: print '>> allowing', url, 'fetch by', useragent
              return 1
          path = urlparse.urlparse(url)[2]
          for rule in self.rules[ag]:
!             if rule.match(path) is not None:
!                 if self.debug: print '>> disallowing', url, 'fetch by', useragent
                  return 0
!         if self.debug: print '>> allowing', url, 'fetch by', useragent
          return 1
  
! def _test():
      rp = RobotFileParser()
      rp.debug = 1
!     rp.set_url('http://www.musi-cal.com/robots.txt')
      rp.read()
      print rp.rules
!     print rp.can_fetch('*', 'http://www.musi-cal.com/')
      print rp.can_fetch('Musi-Cal-Robot',
!                        'http://www.musi-cal.com/cgi-bin/event-search?city=San+Francisco')
  
! if __name__ == "__main__":
!     _test()
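
The module did make the move to Lib; in today's standard library it ships
as urllib.robotparser in Python 3. A minimal usage sketch against that
API, reusing the robots.txt URL from _test() (which may no longer
resolve):

    from urllib.robotparser import RobotFileParser

    rp = RobotFileParser()
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()    # fetch and parse the robots.txt file
    print(rp.can_fetch('*', 'http://www.musi-cal.com/'))

The set_url/read/can_fetch surface shown in the diff survived the
migration unchanged.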