[Tutor] Recursion depth exceeded in python web crawler
Daniel Bosah
dbosah at buffalo.edu
Thu Jun 14 14:32:46 EDT 2018
I am trying to modify code from a web crawler to scrape certain websites for keywords. However, I'm trying to run the web crawler before I modify it, and I'm running into issues.
When I ran this code:
import threading
from Queue import Queue
from spider import Spider
from domain import get_domain_name
from general import file_to_set

PROJECT_NAME = "SPIDER"
HOME_PAGE = "https://www.cracked.com/"
DOMAIN_NAME = get_domain_name(HOME_PAGE)
QUEUE_FILE = '/home/me/research/queue.txt'
CRAWLED_FILE = '/home/me/research/crawled.txt'
NUMBER_OF_THREADS = 1
# Capitalize variables and make them class variables to make them const variables

threadqueue = Queue()

Spider(PROJECT_NAME, HOME_PAGE, DOMAIN_NAME)

def crawl():
    change = file_to_set(QUEUE_FILE)
    if len(change) > 0:
        print str(len(change)) + ' links in the queue'
        create_jobs()

def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        threadqueue.put(link)  # .put = put item into the queue
    threadqueue.join()
    crawl()

def create_spiders():
    for _ in range(NUMBER_OF_THREADS):  # _ because we don't act on the iterable
        vari = threading.Thread(target=work)
        vari.daemon = True  # makes sure that it dies when main exits
        vari.start()

# def regex():
#     for i in files_to_set(CRAWLED_FILE):
#         reg(i, LISTS)  # MAKE FUNCTION FOR REGEX; i is the urls, LISTS is a list or set of keywords

def work():
    while True:
        url = threadqueue.get()  # pops item off the queue
        Spider.crawl_pages(threading.current_thread().name, url)
        threadqueue.task_done()

create_spiders()
crawl()
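For context, my mental model of the thread-and-queue part (daemon workers pulling URLs off the Queue) is basically this standalone toy version; the URLs and the print are placeholders, not my spider:

import threading
from Queue import Queue

q = Queue()

def work():
    while True:
        item = q.get()                # blocks until an item is available
        print 'handling ' + item      # stand-in for Spider.crawl_pages(...)
        q.task_done()                 # tells q.join() this item is finished

for _ in range(2):
    t = threading.Thread(target=work)
    t.daemon = True                   # worker dies when the main thread exits
    t.start()

for link in ['http://a.example/', 'http://b.example/']:
    q.put(link)
q.join()                              # waits until every queued item is marked done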
That code used this class:
from HTMLParser import HTMLParser
from urlparse import urlparse

class LinkFinder(HTMLParser):
    def _init_(self, base_url, page_url):
        super()._init_()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()  # stores the links

    def error(self, message):
        pass

    def handle_starttag(self, tag, attrs):
        if tag == 'a':  # means a link
            for (attribute, value) in attrs:
                if attribute == 'href':  # href = relative url, i.e. not having www
                    url = urlparse.urljoin(self.base_url, value)
                    self.links.add(url)

    def return_links(self):
        return self.links()
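For reference, this is my stripped-down understanding of how an HTMLParser subclass like that gets driven: feed() it the page's HTML, then read back the links it collected. It's a sketch with made-up names, and I'm calling HTMLParser.__init__ directly because I believe HTMLParser is an old-style class on Python 2, so super() won't accept it:

from HTMLParser import HTMLParser
from urlparse import urljoin

class TinyLinkFinder(HTMLParser):
    def __init__(self, base_url):
        HTMLParser.__init__(self)      # old-style base class, so no super() here
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href':
                    self.links.add(urljoin(self.base_url, value))

finder = TinyLinkFinder('https://www.example.com/')
finder.feed('<a href="/about">about</a> <a href="https://other.example/">other</a>')
print finder.links                     # the two absolute URLs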
And this spider class:
from urllib import urlopen  # connects to webpages from python
from link_finder import LinkFinder
from general import directory, text_maker, file_to_set, conversion_to_set

class Spider():
    project_name = 'Reader'
    base_url = ''
    Queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.Queue_file = '/home/me/research/queue.txt'
        Spider.crawled_file = '/home/me/research/crawled.txt'
        self.boot()
        self.crawl_pages('Spider 1 ', base_url)

    @staticmethod
    def boot():
        directory(Spider.project_name)
        text_maker(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.Queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_pages(thread_name, page_url):
        if page_url not in Spider.crawled:
            print thread_name + 'crawling ' + page_url
            print 'queue' + str(len(Spider.queue)) + '|crawled' + str(len(Spider.crawled))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content Type'):
                read = response.read()
                html_string = read.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except:
            print 'Error: cannot crawl page'
            return set()
        return finder.return_links()

    @staticmethod
    def add_links_to_queue(links):
        for i in links:
            if i in Spider.queue:
                continue
            if i in Spider.crawled:
                continue
            # if Spider.domain_name != get_domain_name(url):
            #     continue
            Spider.queue.add()

    @staticmethod
    def update_files():
        conversion_to_set(Spider.queue, Spider.Queue_file)
        conversion_to_set(Spider.crawled, Spider.crawled_file)
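One part of gather_links() I'm unsure about is the header check. With Python 2's urllib, which is what the import at the top uses, I think the headers come off response.info() and the header name is the hyphenated 'Content-Type'. A rough sketch of what I mean (example URL only):

from urllib import urlopen

response = urlopen('https://www.example.com/')
content_type = response.info().getheader('Content-Type', '')   # headers live on .info()
if 'text/html' in content_type:
    html_string = response.read().decode('utf-8')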
The spider also uses these functions:
from urlparse import urlparse

# get subdomain name (name.example.com)
def subdomain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''

def get_domain_name(url):
    try:
        variable = subdomain_name.split(',')
        return variable[-2] + ',' + variable[-1]  # returns 2nd-to-last and last pieces of variable
    except:
        return ''
(there are more functions, but those are housekeeping functions)
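In case it matters, this is what I think get_domain_name() is supposed to do, written out as a self-contained guess; I'm assuming the host name should be split on '.' rather than ',':

from urlparse import urlparse

def subdomain_name(url):
    try:
        return urlparse(url).netloc            # e.g. 'www.cracked.com'
    except:
        return ''

def get_domain_name(url):
    try:
        parts = subdomain_name(url).split('.')
        return parts[-2] + '.' + parts[-1]     # e.g. 'cracked.com'
    except:
        return ''

print get_domain_name('https://www.cracked.com/')   # cracked.com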
The interpreter returned this error:

RuntimeError: maximum recursion depth exceeded while calling a Python object
It looks like this happens after crawl() and create_jobs() have called each other a bunch of times. How can I resolve this?
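My rough understanding of the failure, boiled down to a standalone sketch with placeholder names (not my real helpers): crawl() calls create_jobs(), which calls crawl() again, so nothing returns while links remain and every pass adds stack frames until Python's limit is hit.

pending = 2000                 # pretend 2000 links are still queued

def crawl():
    if pending > 0:
        create_jobs()

def create_jobs():
    global pending
    pending -= 1               # "process" one batch of links
    crawl()                    # re-enters crawl() instead of returning

crawl()                        # RuntimeError: maximum recursion depth exceeded

I suspect something loop-shaped (a while loop that drains the queue and never re-enters crawl()) is what I actually want, but I'm not sure that's the right fix.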
Thanks