Mailman 3 Time-out? Wrong code? - lxml - The Python XML Toolkit

July 8, 2023

      Hello,

I'm no lxml expert, so it could be a newbie error…but the following web scrawler script sometimes breaks (see "BUG") while trying to find the number of provinces/properties, even after two one-second sleeps:

==========
import requests
from lxml import html
import re
import math
import time

def grab_properties():
	properties = soup.xpath("//div[contains(@class,'gallery')]/a/@href")
	for property in properties:
		print(property)
		response = requests.get(property)
		coords = pattern_coords.search(response.text) #raw HTML since data in JSON
		if coords:
			lat,lon=coords.group(1),coords.group(2)
			print(f"{lat}\t{lon}")

pattern_count = re.compile("(\d+) Propertie") #ignore trailing s for singular/plural
pattern_coords = re.compile("latitude:(.+?),longitude:([^}]+)") #JSON
provinces = ["a", "b", "c"]
for province in provinces:
	time.sleep(1) #added but still no cigar

	url = f"https://www.acme.com/{province}/"
	print("======== ",url)
	response = requests.get(url)
	soup = html.fromstring(response.text)

	#BUG time-out?
	count = soup.xpath("//div[contains(@class,'properties-count')]/text()")
	print(count)
	count = pattern_count.search(count[0])
	if count:
		print("Number of locations:",count)

	locations = soup.xpath("//div[contains(@class,'other-location-box')]/a/@href")
	for location in locations:
		time.sleep(1) #added but still no cigar
		print(location)
		response = requests.get(location)
		soup = html.fromstring(response.text)

		#BUG time-out?
		count = soup.xpath("//div[contains(@class,'properties-count')]/text()")
		print(count)
		count = pattern_count.search(count[0])
		if not count:
			print("Number of properties not found")
			break #next location
		else:
			print("Number of properties found",count.group(1))

			#grab what's in current, first page
			grab_properties()

			#If > 30, must update URL and loop through pages by groups of 30
			count = int(count.group(1))
			for index in range (2,math.ceil(count/30)+1):
				time.sleep(1) #added but still no cigar
				url = f"{location}p/{index}/" #new URL
				response = requests.get(url)
				soup = html.fromstring(response.text)
				grab_properties()
==========

Am I using the wrong syntax to grab the numbers?

Thank you.

Time-out? Wrong code?

codecomplete＠free.fr

Bob Kline

codecomplete＠free.fr

tags

participants (2)