python screen scraping/parsing
Dan Stromberg
dstromberglists at gmail.com
Fri Jun 13 14:25:47 EDT 2008
BeautifulSoup is a pretty nice Python module for screen scraping web pages
(which are not necessarily well formed).
On Fri, 13 Jun 2008 11:10:09 -0700, bruce wrote:
> Hi...
>
> got a short test app that i'm playing with. the goal is to get data off
> the page in question.
>
> basically, i should be able to get a list of "tr" nodes, and then to
> iterate/parse them. i'm missing something, as i think i can get a single
> node, but i can't figure out how to display the contents of the node..
> nor how to get the list of the "tr" nodes....
>
> my test code is:
> --------------------------------
> #!/usr/bin/python
>
>
> #test python script
> import re
> import libxml2dom
> import urllib
> import urllib2
> import sys, string
> from mechanize import Browser
> import mechanize
> #import tidy
> import os.path
> import cookielib
> from libxml2dom import Node
> from libxml2dom import NodeList
>
> ########################
> #
> # Parse pricegrabber.com
> ########################
>
>
> # datafile
> tfile = open("price.dat", 'wr+')
> efile = open("price_err.dat", 'wr+')
>
>
> urlopen = urllib2.urlopen
> ##cj = urllib2.cookielib.LWPCookieJar()
> Request = urllib2.Request
> br = Browser()
>
>
> user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
> values1 = {'name' : 'Michael Foord',
> 'location' : 'Northampton',
> 'language' : 'Python' }
> headers = { 'User-Agent' : user_agent }
>
>
> url ="http://www.pricegrabber.com/rating_summary.php/page=1"
>
> #=======================================
>
>
> if __name__ == "__main__":
> # main app
>
> txdata = None
>
> #----------------------------
> # get the kentucky test pages
>
> #br.set_cookiejar(cj)
> br.set_handle_redirect(True)
> br.set_handle_referer(True)
> br.set_handle_robots(False)
> br.addheaders = [('User-Agent', 'Firefox')]
> br.open(url)
> #cj.save(COOKIEFILE) # resave cookies
>
> res = br.response() # this is a copy of response
> s = res.read()
>
> # s contains HTML not XML text
> d = libxml2dom.parseString(s, html=1)
>
> print "d = d",d
>
> #get the input/text dialogs
> #tn1 = "//div[@id='main_content']/form[1]/input[position()=1]/@name"
>
> t1 = "/html/body/div[@id='pgSiteContainer']/div[@id='pgPageContent']/table[2]/tbody"
> tr = "/html/body/div[@id='pgSiteContainer']/div[@id='pgPageContent']/table[2]/tbody/tr[4]"
>
> tr_=d.xpath(tr)
>
> print "len =",tr_[1].nodeValue
>
> print "fin"
>
> -----------------------------------------------
>
> my issue appears to be related to the last "tbody", or tbody/tr[4]...
>
> if i leave off the tbody, i can display data, as the tr_ is an array
> with data...
>
> with the "tbody" it appears that the tr_ array is not defined, or it has
> no data... however, i can use the DOM tool with firefox to observe the
> fact that the "tbody" is there...
>
> so.. what am i missing...
>
>
> thoughts/comments are most welcome...
>
> also, i'm willing to send a small amount via paypal!!
>
> -bruce
More information about the Python-list
mailing list