python - firefox dom/xpath question/issue
bruce
bedouglas at earthlink.net
Mon Aug 25 16:03:06 EDT 2008
Hi.
I have a test web page that basically has two "<html>" tags in it. Examining
the page via the Firefox DOM Inspector, I can create a test XPath query,
"/html/body/form", which gets the target form for the test.
The issue comes when I examine the page's source html. It looks like:
<html>
<body>
</body>
</html>
<html>
<body>
.
.
.
</body>
</html>
I've simplified things a bit, but basically the first "html/body" is empty,
and the second contains the data/nodes I need.
When using xpath("/html/body/form"), the app returns nothing or crashes. I've
also tried something like xpath("/html[position()=0]"), with no luck. It's as
if XPath only looks at the first html element that it sees in a given page. I
can't seem to find any XPath docs that explain how to work around this. I'm
using libxml2dom with Python 2.5.1.
Any thoughts/comments...
If I comment out the 1st html section, things work as they should. The test
code is below...
thanks
------------------------------------------
#!/usr/bin/python
#
# test.py
#
# scrapes/extracts the basic data for the college
#
#
# the app gets/stores
# name
# url
# address (street/city/state)
# phone
#
######################################################################3
#test python script
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time
# ----------------------------------------------------------------------
# Module-level setup: urllib2 shortcuts, mechanize browser instances and
# the request constants for the test run.
# Cookie-jar support (cj / COOKIEFILE) is currently disabled.
# ----------------------------------------------------------------------

# Short aliases for the urllib2 entry points.
urlopen = urllib2.urlopen
Request = urllib2.Request

# Two independent mechanize browser instances.
br, br2 = Browser(), Browser()

# Request header and sample form values.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}
values1 = dict(
    name='Michael Foord',
    location='Northampton',
    language='Python',
)

# Page targeted by this test.
url = "http://schedule.psu.edu/"
#=======================================
if __name__ == "__main__":
    # Test driver: parse a saved copy of the page with libxml2dom and report
    # how many nodes the XPath query "/html/body/form" matches.
    #
    # The live-fetch path (reading br.response()) is disabled; the HTML is
    # read from a local file instead.  Every print below is written as a
    # single-argument print(...), which produces the same output as the
    # Python 2 print statement.

    # Browser configuration (cookie-jar handling is currently disabled).
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.addheaders = [('User-Agent', 'Firefox')]
    print("url = %s" % url)

    # Read the saved page.  try/finally guarantees the handle is closed even
    # if read() raises; Python 2.5 has no `with` statement unless it is
    # enabled through a __future__ import at the top of the file.
    tfile = open("/college/psu1.dat")
    try:
        s = tfile.read()
    finally:
        tfile.close()
    print(s)

    # s contains HTML, not XML text, hence html=1.
    d = libxml2dom.parseString(s, html=1)
    print("d %s" % (d,))

    # NOTE(review): the response of this request is never read; the document
    # queried below comes from the local file above.
    br.open(url)
    print("slen= %d" % len(s))

    # Run the query under test and report what it matched.
    name_ = d.xpath("/html/body/form")
    len_ = len(name_)
    print("len= %d" % len_)
    print("name1 %s" % (name_,))
    print("len %d" % len(name_))

    sys.exit()
    # NOTE(review): unreachable after sys.exit() when run as a script.  Kept
    # because the original indentation was lost, so it is unclear whether
    # this line belonged to the guard or to module level.
    print("here...")
More information about the Python-list
mailing list