Link Checking Issues - Subdomains
rpupkin77
pthompson2005 at gmail.com
Tue Aug 5 11:14:20 EDT 2008
Hi,
I have written this script to run as a cron job; it loops through a
text file containing a list of URLs. It works fine for most of the
links, but a number of the URLs are subdomains (they are government
sites), such as http://basename.airforce.mil, and these always throw
400 errors even though the sites exist.
Is there a way to get around this?
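One thing I am unsure about: iterating over the file gives each line
with its trailing newline still attached, and urlparse keeps that
newline in the network location, e.g.

    >>> from urlparse import urlparse
    >>> urlparse("http://basename.airforce.mil\n")[1]
    'basename.airforce.mil\n'

but I don't know whether that would explain why only the subdomain
links fail.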
Here is the script:
import httplib
from urlparse import urlparse

class LinkChecker:

    def oldStuff(self, url):  # older version, no longer used
        p = urlparse(url)
        h = httplib.HTTP(p[1])
        h.putrequest('HEAD', p[2])
        h.endheaders()
        if h.getreply()[0] == 200: return 1
        else: return 0

    def check(self):
        print "\nLooping through the file, line by line."
        # define default values for the parameters
        text_file = open("/home/jjaffe/pythonModules/JAMRSscripts/urls.txt", "r")
        output = ""
        errors = "=================== ERRORS (website exists but 404, 503 etc.): ===================\n"
        failures = "\n=================== FAILURES (cannot connect to website at all): ===================\n"
        eCount = 0
        fCount = 0
        # loop through each line and see what the response code is
        for line in text_file:
            p = urlparse(line)
            try:
                conn = httplib.HTTPConnection(p[1])
                conn.request("GET", p[2])
                r1 = conn.getresponse()
                if r1.status != 200:
                    # the response code was not success (200), so report the error
                    errors += "\n " + str(r1.status) + " error for: " + p[1] + p[2]
                    eCount = eCount + 1
                data1 = r1.read()
                conn.close()
            except:
                # the connection attempt failed, so the website doesn't
                # even exist (note: a bare except also swallows any
                # other error raised in the try block)
                failures += "\n Could not create connection object: " + p[1] + p[2]
                fCount = fCount + 1
        text_file.close()
        # see if there were errors and create the output string
        if (eCount == 0) and (fCount == 0):
            output = "No errors or failures to report"
        else:
            output = errors + "\n\n" + failures
        print output

if __name__ == '__main__':
    lc = LinkChecker()
    lc.check()
    del lc
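For what it's worth, here is a variant of the request I have been
considering, in case stray whitespace is the problem. This is just a
sketch (check_url is a throwaway name, and I have not tested it
against the government sites): it strips each line before parsing and
falls back to "/" when the URL has no path.

    import httplib
    from urlparse import urlparse

    def check_url(url):
        # strip the trailing newline/whitespace before parsing,
        # and fall back to "/" when the URL has no path component
        p = urlparse(url.strip())
        conn = httplib.HTTPConnection(p[1])
        conn.request("GET", p[2] or "/")
        return conn.getresponse().status

Would that be the right approach, or is something else going on with
these subdomains?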
Thanks in advance.