Link Checking Issues - Subdomains
rpupkin77
pthompson2005 at gmail.com
Tue Aug 5 11:14:20 EDT 2008
Hi,
I have written this script to run as a cron job; it loops through a
text file containing a list of URLs. It works fine for most of the
links, but a number of the URLs are subdomains (they are government
sites), such as http://basename.airforce.mil, and these always throw
400 errors even though the sites exist.
Is there a way to get around this?
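One thing I am unsure about: iterating over the file gives each line
with its trailing newline still attached, and urlparse keeps that
newline in the network location, e.g.

    >>> from urlparse import urlparse
    >>> urlparse("http://basename.airforce.mil\n")[1]
    'basename.airforce.mil\n'

but I don't know whether that would explain why only the subdomain
links fail.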
Here is the script:
import httplib
from urlparse import urlparse

class LinkChecker:

    def oldStuff(self, url):  # older version, no longer used
        p = urlparse(url)
        h = httplib.HTTP(p[1])
        h.putrequest('HEAD', p[2])
        h.endheaders()
        if h.getreply()[0] == 200: return 1
        else: return 0

    def check(self):
        print "\nLooping through the file, line by line."
        # define default values for the parameters
        text_file = open("/home/jjaffe/pythonModules/JAMRSscripts/urls.txt", "r")
        output = ""
        errors = "=================== ERRORS (website exists but 404, 503 etc.): ===================\n"
        failures = "\n=================== FAILURES (cannot connect to website at all): ===================\n"
        eCount = 0
        fCount = 0
        # loop through each line and see what the response code is
        for line in text_file:
            p = urlparse(line)
            try:
                conn = httplib.HTTPConnection(p[1])
                conn.request("GET", p[2])
                r1 = conn.getresponse()
                if r1.status != 200:
                    # the response code was not success (200), so report the error
                    errors += "\n " + str(r1.status) + " error for: " + p[1] + p[2]
                    eCount = eCount + 1
                data1 = r1.read()
                conn.close()
            except:
                # the connection attempt failed, so the website doesn't
                # even exist (note: a bare except also swallows any
                # other error raised in the try block)
                failures += "\n Could not create connection object: " + p[1] + p[2]
                fCount = fCount + 1
        text_file.close()
        # see if there were errors and create the output string
        if (eCount == 0) and (fCount == 0):
            output = "No errors or failures to report"
        else:
            output = errors + "\n\n" + failures
        print output

if __name__ == '__main__':
    lc = LinkChecker()
    lc.check()
    del lc
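For what it's worth, here is a variant of the request I have been
considering, in case stray whitespace is the problem. This is just a
sketch (check_url is a throwaway name, and I have not tested it
against the government sites): it strips each line before parsing and
falls back to "/" when the URL has no path.

    import httplib
    from urlparse import urlparse

    def check_url(url):
        # strip the trailing newline/whitespace before parsing,
        # and fall back to "/" when the URL has no path component
        p = urlparse(url.strip())
        conn = httplib.HTTPConnection(p[1])
        conn.request("GET", p[2] or "/")
        return conn.getresponse().status

Would that be the right approach, or is something else going on with
these subdomains?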
Thanks in advance.