python/xpath issue..
bruce
bedouglas at earthlink.net
Mon Aug 25 07:49:15 EDT 2008
hey guys...
got a weird, hopefully simple issue.
the following sample bit of script is stripped down, and simply gets the
"form" node from the specified site "schedule.psu.edu".
the problem i run into is that parsing with libxml2dom works, and
i get the dom object every time i run the app, but the xpath query is
intermittent!!! in other words, i can run the script 10 times, and it might
work 7 or 8 times; the other times, the xpath query doesn't give the nodes
back...
when it works, name_ in the app (printed as "name1") should be a list of
nodes (one for each of the 2 forms in the page), and len_ should be 2.
is there anything you might suggest that i try in order to get a better
handle on exactly what might be going on here...
keep in mind, i'm not a python guy, just trying to get this to consistently
work... my suspicion is that the culprit might be memory related...
i'm running linux, on a x86 dual core with 4G ram. the python is 2.5.1.
thoughts/comments/etc would be appreciated...
-thanks!!!
#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
# name
# url
# address (street/city/state
# phone
#
######################################################################
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time
########################
#
# HTTP setup for fetching schedule.psu.edu
########################
# No cookie jar is installed.  An earlier, commented-out attempt used a
# cookielib.LWPCookieJar (file 'cookies.lwp') together with
# urllib2.HTTPCookieProcessor.

# Short aliases for the urllib2 entry points.
urlopen = urllib2.urlopen
Request = urllib2.Request

# Two mechanize browsers; only `br` is used by the main block.
br, br2 = Browser(), Browser()

# Sample request data.  Only `url` is read by the main block.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values1 = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
headers = {'User-Agent': user_agent}
url = "http://schedule.psu.edu/"
#=======================================
if __name__ == "__main__":
# main app
txdata = None
#----------------------------
##br.set_cookiejar(cj)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.addheaders = [('User-Agent', 'Firefox')]
print "url =",url
br.open(url)
##cj.save(COOKIEFILE) # resave cookies
res = br.response() # this is a copy of response
s = res.read()
print "slen=",len(s)
# s contains HTML not XML text
d = libxml2dom.parseString(s, html=1)
print "d",d
name_=[]
len_=0
name_ = d.xpath("//form")
#name_ = d.xpath("/html/body/form")
print "name1",name_
len_ = len(name_)
print "len",len(name_)
#print "sdlfs"
sys.exit()
# else:
# print "err in form_ID"
print "here..."
More information about the Python-list
mailing list