[Tutor] Question about Python code that is not working
Arthur Kolbe
kolbe.business at gmail.com
Mon Jun 19 09:40:00 EDT 2023
I want to be able to give this program a list of websites and have it
crawl all pages of each website I entered, checking for any 404 pages.
Then I want the program to create a CSV file with two columns: in the
left column, all the websites I entered, and in the right column the
status of each website, either "no 404 pages found on this website" or
"404 pages found on this website", or "website not found" if the website
can't be reached. This is the code I tried to build, but it's not working.
If you could tell me what I'm doing wrong and how to fix it, that would be
amazing! Thanks in advance!
Code:
import csv

import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class PageChecker(CrawlSpider):
    name = 'page_checker'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'CONCURRENT_REQUESTS': 4,
        'USER_AGENT': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'LOG_LEVEL': 'ERROR'
    }

    def __init__(self, websites=None, **kwargs):
        self.start_urls = websites
        self.allowed_domains = [self.extract_domain(url) for url in websites]
        self.rules = [Rule(LinkExtractor(allow=()), callback=self.parse_page, follow=True)]
        self.results = {}  # Track results for each URL
        super().__init__(**kwargs)

    @staticmethod
    def extract_domain(url):
        return url.split('//')[-1].split('/')[0]

    def parse_page(self, response):
        status_code = response.status
        if status_code == 404:
            self.results.setdefault(response.url, True)

    def closed(self, reason):
        save_to_csv(self.results)


def check_404_pages(websites):
    results = {}
    runner = CrawlerRunner(get_project_settings())

    for website in websites:
        # Pass the spider class and results dictionary
        runner.crawl(PageChecker, websites=[website], results=results)

    d = runner.join()
    # Save results to CSV after the crawl is complete
    d.addBoth(lambda _: save_to_csv(results))


def save_to_csv(results):
    csv_file_path = '404_results.csv'
    with open(csv_file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['URL', 'Status'])
        for url, _ in results.items():
            writer.writerow([url, '404 Pages Found'])

    if not results:
        print("No 404 pages found.")
    else:
        print("CSV file created successfully.")


websites = [
    "https://osteopathie.org",
    "https://hpo-osteopathie.de",
    "https://osteopathiezentrum.de"
]

# Check for 404 pages by crawling all pages of the websites and save the
# results to a CSV file
check_404_pages(websites)

reactor.run()
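
In case it helps to see the output I am aiming for, here is a minimal
standalone sketch of just the CSV-writing step. The three status texts
are hard-coded placeholders (not real crawl results), purely to show the
two-column format I would like to end up with:

import csv

# Example input: each website I entered mapped to one of the three status
# texts described above. The statuses here are made-up placeholders.
example_results = {
    "https://osteopathie.org": "no 404 pages found on this website",
    "https://hpo-osteopathie.de": "404 pages found on this website",
    "https://osteopathiezentrum.de": "website not found",
}

with open("404_results.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Website", "Status"])           # two columns: website, status
    for website, status in example_results.items():  # one row per website entered
        writer.writerow([website, status])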
Kind regards,
Arthur