[Tutor] Recursion depth exceeded in python web crawler
Daniel Bosah
dbosah at buffalo.edu
Thu Jun 14 14:32:46 EDT 2018
I am trying to modify code from a web crawler to scrape certain websites for keywords. However, I'm trying to run the web crawler before I modify it, and I'm running into issues.
When I ran this code:
import threading
from Queue import Queue
from spider import Spider
from domain import get_domain_name
from general import file_to_set

PROJECT_NAME = "SPIDER"
HOME_PAGE = "https://www.cracked.com/"
DOMAIN_NAME = get_domain_name(HOME_PAGE)
QUEUE_FILE = '/home/me/research/queue.txt'
CRAWLED_FILE = '/home/me/research/crawled.txt'
NUMBER_OF_THREADS = 1
# Capitalize variables and make them class variables to make them const variables

threadqueue = Queue()

Spider(PROJECT_NAME, HOME_PAGE, DOMAIN_NAME)

def crawl():
    change = file_to_set(QUEUE_FILE)
    if len(change) > 0:
        print str(len(change)) + ' links in the queue'
        create_jobs()

def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        threadqueue.put(link)  # .put = put item into the queue
    threadqueue.join()
    crawl()

def create_spiders():
    for _ in range(NUMBER_OF_THREADS):  # _ because we don't act on the iterable
        vari = threading.Thread(target=work)
        vari.daemon = True  # makes sure that it dies when main exits
        vari.start()

# def regex():
#     for i in files_to_set(CRAWLED_FILE):
#         reg(i, LISTS)  # MAKE FUNCTION FOR REGEX; i is the urls, LISTS is a list or set of keywords

def work():
    while True:
        url = threadqueue.get()  # pops item off the queue
        Spider.crawl_pages(threading.current_thread().name, url)
        threadqueue.task_done()

create_spiders()
crawl()
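For context, my mental model of the thread-and-queue part (daemon workers pulling URLs off the Queue) is basically this standalone toy version; the URLs and the print are placeholders, not my spider:

import threading
from Queue import Queue

q = Queue()

def work():
    while True:
        item = q.get()                # blocks until an item is available
        print 'handling ' + item      # stand-in for Spider.crawl_pages(...)
        q.task_done()                 # tells q.join() this item is finished

for _ in range(2):
    t = threading.Thread(target=work)
    t.daemon = True                   # worker dies when the main thread exits
    t.start()

for link in ['http://a.example/', 'http://b.example/']:
    q.put(link)
q.join()                              # waits until every queued item is marked done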
That code used this class:
from HTMLParser import HTMLParser
from urlparse import urlparse

class LinkFinder(HTMLParser):
    def _init_(self, base_url, page_url):
        super()._init_()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()  # stores the links

    def error(self, message):
        pass

    def handle_starttag(self, tag, attrs):
        if tag == 'a':  # means a link
            for (attribute, value) in attrs:
                if attribute == 'href':  # href = relative url, i.e. not having www
                    url = urlparse.urljoin(self.base_url, value)
                    self.links.add(url)

    def return_links(self):
        return self.links()
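For reference, this is my stripped-down understanding of how an HTMLParser subclass like that gets driven: feed() it the page's HTML, then read back the links it collected. It's a sketch with made-up names, and I'm calling HTMLParser.__init__ directly because I believe HTMLParser is an old-style class on Python 2, so super() won't accept it:

from HTMLParser import HTMLParser
from urlparse import urljoin

class TinyLinkFinder(HTMLParser):
    def __init__(self, base_url):
        HTMLParser.__init__(self)      # old-style base class, so no super() here
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attribute, value in attrs:
                if attribute == 'href':
                    self.links.add(urljoin(self.base_url, value))

finder = TinyLinkFinder('https://www.example.com/')
finder.feed('<a href="/about">about</a> <a href="https://other.example/">other</a>')
print finder.links                     # the two absolute URLs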
And this spider class:
from urllib import urlopen  # connects to webpages from python
from link_finder import LinkFinder
from general import directory, text_maker, file_to_set, conversion_to_set

class Spider():
    project_name = 'Reader'
    base_url = ''
    Queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.Queue_file = '/home/me/research/queue.txt'
        Spider.crawled_file = '/home/me/research/crawled.txt'
        self.boot()
        self.crawl_pages('Spider 1 ', base_url)

    @staticmethod
    def boot():
        directory(Spider.project_name)
        text_maker(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.Queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    @staticmethod
    def crawl_pages(thread_name, page_url):
        if page_url not in Spider.crawled:
            print thread_name + 'crawling ' + page_url
            print 'queue' + str(len(Spider.queue)) + '|crawled' + str(len(Spider.crawled))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.crawled.add(page_url)
            Spider.update_files()

    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content Type'):
                read = response.read()
                html_string = read.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except:
            print 'Error: cannot crawl page'
            return set()
        return finder.return_links()

    @staticmethod
    def add_links_to_queue(links):
        for i in links:
            if i in Spider.queue:
                continue
            if i in Spider.crawled:
                continue
            # if Spider.domain_name != get_domain_name(url):
            #     continue
            Spider.queue.add()

    @staticmethod
    def update_files():
        conversion_to_set(Spider.queue, Spider.Queue_file)
        conversion_to_set(Spider.crawled, Spider.crawled_file)
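One part of gather_links() I'm unsure about is the header check. With Python 2's urllib, which is what the import at the top uses, I think the headers come off response.info() and the header name is the hyphenated 'Content-Type'. A rough sketch of what I mean (example URL only):

from urllib import urlopen

response = urlopen('https://www.example.com/')
content_type = response.info().getheader('Content-Type', '')   # headers live on .info()
if 'text/html' in content_type:
    html_string = response.read().decode('utf-8')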
The spider also uses these functions:
from urlparse import urlparse

# get subdomain name (name.example.com)
def subdomain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''

def get_domain_name(url):
    try:
        variable = subdomain_name.split(',')
        return variable[-2] + ',' + variable[-1]  # returns 2nd-to-last and last pieces of variable
    except:
        return ''
(there are more functions, but those are housekeeping functions)
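In case it matters, this is what I think get_domain_name() is supposed to do, written out as a self-contained guess; I'm assuming the host name should be split on '.' rather than ',':

from urlparse import urlparse

def subdomain_name(url):
    try:
        return urlparse(url).netloc            # e.g. 'www.cracked.com'
    except:
        return ''

def get_domain_name(url):
    try:
        parts = subdomain_name(url).split('.')
        return parts[-2] + '.' + parts[-1]     # e.g. 'cracked.com'
    except:
        return ''

print get_domain_name('https://www.cracked.com/')   # cracked.com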
The interpreter returned this error:

RuntimeError: maximum recursion depth exceeded while calling a Python object
It looks like this happens after crawl() and create_jobs() have called each other a bunch of times. How can I resolve this?
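My rough understanding of the failure, boiled down to a standalone sketch with placeholder names (not my real helpers): crawl() calls create_jobs(), which calls crawl() again, so nothing returns while links remain and every pass adds stack frames until Python's limit is hit.

pending = 2000                 # pretend 2000 links are still queued

def crawl():
    if pending > 0:
        create_jobs()

def create_jobs():
    global pending
    pending -= 1               # "process" one batch of links
    crawl()                    # re-enters crawl() instead of returning

crawl()                        # RuntimeError: maximum recursion depth exceeded

I suspect something loop-shaped (a while loop that drains the queue and never re-enters crawl()) is what I actually want, but I'm not sure that's the right fix.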
Thanks