A fun python CLI program for all to enjoy!
DFS
nospam at dfs.com
Fri May 6 15:10:37 EDT 2016
getAddresses.py
Scrapes addresses from www.usdirectory.com and stores them in a SQLite
database, or writes them to text files for mailing labels, etc
Now, just by typing 'fast food Taco Bell <city> 10 db all' you can find
out how many Taco Bells are within 10 miles of you, and store all the
addresses in your own address database.
No more convoluted Googling, or hitting the 'Next Page' button, or
fumbling with the Yellow Pages...
Note: the db structure is flat on purpose, and the .csv files aren't
quote delimited.
Put the program in its own directory. It creates the SQLite database
there, and writes files there, too.
Reviews of code, bug reports, criticisms, suggestions for improvement,
etc are all welcome.
Enjoy!
========================================================================
#getAddresses.py
import os, sys, requests, time, datetime
from lxml import html
import pyodbc, sqlite3, re
#show values of variables, HTML content, etc
#set it to False for short/concise program output
verbose = False
if verbose == True:
print "The verbose setting is turned On."
print ""
#check if address is unique
addrCheck = []
def addrUnique(addr):
if addr not in addrCheck:
x = True
addrCheck.append(addr)
else: x = False
return x
#validate and parse command line
def showHelp():
print ""
print " Enter search word(s), city or zip, state, miles to search, txt
or csv or db, # addresses to save (no commas)"
print ""
print " eg: restaurant Knoxville TN 10 txt 50"
print " search for restaurants within 10 miles of Knoxville TN, and
write"
print " the first 50 address to a txt file"
print ""
print " eg: furniture 30303 GA 20 csv all"
print " search for furniture within 20 miles of zip 30303 GA,"
print " and write all results to a csv file"
print ""
print " eg: boxing gyms Detroit MI 10 db 5"
print " search for boxing gyms within 10 miles of Detroit MI, and
store"
print " the first 5 results in a database"
print ""
print " All entries are case-insensitive (ie TX or tx are acceptable)"
exit(0)
argCnt = len(sys.argv)
if argCnt < 7: showHelp()
if verbose == True:
print ""
print str(argCnt) + " arguments"
keyw = "" #eg restaurant, boxing gym
if argCnt == 7: keyw = sys.argv[1] #one search word
if argCnt > 7: #multiple search words
for i in range(1,argCnt-5):
keyw = keyw + sys.argv[i] + "+"
keyw = keyw[:-1] #drop trailing + sign
cityzip = sys.argv[argCnt-5] #eg Atlanta or 30339
state = sys.argv[argCnt-4] #eg GA
miles = sys.argv[argCnt-3] #eg 5,10,20,30,50 (website allows max 30)
store = sys.argv[argCnt-2] #write address to file or database
addrWant = sys.argv[argCnt-1] #eg save All or number >0
if addrWant.lower() != "all": #how many addresses to save
if addrWant.isdigit() == False: showHelp()
if addrWant == "0" : showHelp()
addrWant = int(addrWant)
elif addrWant.lower() == "all": addrWant = addrWant.lower()
else: addrWant = int(addrWant)
if store != "csv" and store != "txt" and store != "db": showHelp()
#begin timing the code
startTime = time.clock()
#website, SQLite db, search string, current date/time for use with db
datasrc = "www.usdirectory.com"
dbName = "addresses.sqlite"
search = keyw + " " + str(cityzip) + " " + state + " " + str(miles) + "
" + str(addrWant)
loaddt = datetime.datetime.now()
#write addresses to file
#each time the same search is done, the file is deleted and recreated
if store == "csv" or store == "txt":
#csv will write in .csv format - header and 1 line per address
#txt will write out 3 lines per address, then blank before next address
webfile = "usdirectory.com_"+keyw+"_"+cityzip+"_"+state+"."+store
f = open(webfile,"w")
if store == "csv": f.write("Name,Address,CityStateZip\n")
f.close
#store addresses in database
cSQL = ""
if store == "db":
#creates a SQLite database that Access 2003 can't read
#conn = sqlite3.connect(dbName)
#also creates a SQLite database that Access 2003 can't read
conn = pyodbc.connect('Driver={SQLite3 ODBC Driver};Database=' + dbName)
db = conn.cursor()
cSQL = "CREATE TABLE If Not Exists ADDRESSES "
cSQL += "(datasrc, search, category, name, street, city, state, zip,
loaddt, "
cSQL += "PRIMARY KEY (datasrc, search, name, street));"
db.execute(cSQL)
# cSQL = "CREATE TABLE If Not Exists CATEGORIES "
# cSQL += "(catID INTEGER PRIMARY KEY, catDesc);"
# db.execute(cSQL)
# db.execute("CREATE UNIQUE INDEX If Not Exists UIDX_CATDESC ON
CATEGORIES (catDesc);")
conn.commit()
if verbose == True:
print("connected to database: " + dbName)
print cSQL
print("created table: addresses")
print("")
if verbose == True:
print "Search summary"
print "------------------------------"
print "Keywords: " + keyw
print "City/Zip: " + cityzip
print "State : " + state
print "Radius : " + str(miles) + " miles"
print "Save : " + str(addrWant) + " addresses to " + store
print "------------------------------"
print ""
#build url
wBase = "http://www.usdirectory.com"
wForm = "/ypr.aspx?fromform=qsearch"
wKeyw = "&qhqn=" + keyw
wCityZip = "&qc=" + cityzip
wState = "&qs=" + state
wDist = "&rg=" + str(miles)
wSort = "&sb=a2z" #sort alpha
wPage = "&ap=" #used with the results page number
webpage = wBase + wForm + wKeyw + wCityZip + wState + wDist
if verbose == True:
print "Search url: \n" + webpage
print ""
#delete previous results of identical search
if store == "db":
cSQL = "DELETE FROM addresses "
cSQL += "WHERE datasrc = '" + datasrc + "' "
cSQL += "AND search = '" + search + "';"
if verbose == True: print cSQL
db.execute(cSQL)
conn.commit()
#query web server, save results
print "searching..."
i = 0
dupes = 0
addrReturned = 0
addrSaved = 0
while 1:
wPageNbr = wPage + str(i+1)
webpage = wBase + wForm + wKeyw + wCityZip + wState + wDist + wSort +
wPageNbr
page = requests.get(webpage)
tree = html.fromstring(page.content)
#no matches
matches = tree.xpath('//strong/text()')
if i == 0 and "No results were found" in str(matches):
print "No results found for that search"
exit(0)
os.remove(webfile)
#parse number of addresses returned
#some searches return 2 items: ['Found N results', 'junk']
#some searches return 3 items: ['Filter this search','Found N
results','junk']
if i == 0:
match = tree.xpath('//div[@class="header_text"]/text()')
if len(match) > 2: match.pop(0) #remove first element if 2
match = [int(s) for s in match[0].split() if s.isdigit()]
addrFound = match[0]
if addrWant != "all": addrWant = min(addrWant,addrFound)
print str(addrFound) + " matches found (" + str(addrWant) + " will be
saved)"
print ""
#split names, addresses into lists
nms = tree.xpath('//span[@class="header_text3"]/text()')
if len(nms) == 0: break
addr = tree.xpath('//span[@class="text3"]/text()')
addr = [t.replace("\r\n", "") for t in addr]
addr = filter(None, (t.strip() for t in addr))
street = [s.split(',')[0] for s in addr]
city = [c.split(',')[1].strip() for c in addr]
state = [s[-8:][:2] for s in addr]
zip = [z[-5:] for z in addr]
#get usdirectory.com categories
category = tree.xpath('//a/text()')
category = [c.strip() for c in category]
category = filter(None, category)
pattern = re.compile(r"^[A-Z\s&,-]+$")
category = [x for x in category if pattern.match(x)]
#screen feedback
print "retrieving page " + str(i+1) + ": " + str(len(nms)) + " addresses"
if verbose == True:
print ""
print "Names: \n" + str(nms)
print ""
print "Addresses: \n" + str(addr)
print ""
print "Categories: \n" + str(category)
print
"----------------------------------------------------------------------------------------"
print ""
#data integrity check - make sure all lists have same # of items
lenData =
[len(category),len(nms),len(addr),len(street),len(city),len(state),len(zip)]
if len(set(lenData)) != 1:
print "Data parsing issue. One or more lists has an incorrect number
of items. Program will exit."
exit(0)
if verbose == True:
if len(set(lenData)) == 1: print "Verified: each list has " +
str(len(nms)) + " items in it."
#write addresses to file
if store == "txt" or store == "csv":
addrList = []
for j in range(len(nms)):
if addrUnique(nms[j]+' '+street[j]) == True:
if store == "txt":
addrList.append(nms[j]+'\n'+street[j]+'\n'+city[j]+', '+state[j]+'
'+zip[j]+'\n\n')
if store == "csv": addrList.append(nms[j]+',' +street[j]+','
+city[j]+' ' +state[j]+' '+zip[j]+'\n')
addrSaved += 1
else:
dupes += 1
print " * duplicate address found: " + nms[j] + ", " + street[j]
addrReturned += 1
if addrWant != "all":
if addrSaved >= addrWant: break
f = open(webfile,"a")
for address in addrList: f.write(address)
f.close
#write addresses to database
if store == "db":
for j in range(len(nms)):
dupeRow = False
cSQL = "INSERT INTO ADDRESSES VALUES (?,?,?,?,?,?,?,?,?)"
#(datasrc,search,category,name,street,city,state,zip,loaddt) "
Vals =
datasrc,search,category[j],nms[j],street[j],city[j],state[j],zip[j],str(loaddt)
if verbose == True: print cSQL + ',' + str(Vals)
try: db.execute(cSQL, Vals)
except (pyodbc.Error) as programError:
if str(programError).find("UNIQUE constraint failed") > 0:
dupeRow = True
dupes +=1
print " * duplicate address found: " + nms[j] + ", " + street[j]
pass
addrReturned += 1
if dupeRow == False:
addrSaved += 1
if addrWant != "all":
if addrSaved >= addrWant: break
conn.commit()
if addrSaved >= addrFound or addrSaved >= addrWant: break
i += 1
time.sleep(2)
#finish
if (store == "csv" or store == "txt"):
print "\nFinished\nWrote " + str(addrSaved) + " addresses to file " +
webfile
elif store == "db":
db.close()
conn.close()
print "\nFinished\nStored " + str(addrSaved) + " addresses in database:
" + dbName
if dupes > 0: print "(" + str(dupes) + " duplicate addresses ignored)"
#timer
endTime = time.clock()
print "processing time: %.2g seconds" %(endTime-startTime)
#bug in www.usdirectory.com code: usually overreports matches by 1
if (addrWant == "all") and (addrReturned != addrFound):
print "Note: " + datasrc + " reported " + str(addrFound) + " matches,
but returned " + str(addrReturned)
========================================================================
More information about the Python-list
mailing list