Javascript website scraping using WebKit and Selenium tools

Veek M vek.m1234 at gmail.com
Thu Jul 2 03:11:40 CEST 2015


I tried scraping a JavaScript website using two tools; neither worked. The 
website link is: http://xdguo.taobao.com/category-499399872.htm The relevant 
text I'm trying to extract is 'GY-68...':

<div class="item3line1">

    <dl class="item " data-id="38952795780">
        <dt class="photo">
            <a target="_blank" href="//item.taobao.com/item.htm?spm=a1z10.5-
c.w4002-6778075404.11.54MDOI&id=38952795780" data-spm-wangpu-module-
id="4002-6778075404" data-spm-anchor-id="a1z10.5-c.w4002-6778075404.11">
                <img 
src="//img.alicdn.com/bao/uploaded/i4/TB1HMt3FFXXXXaFaVXXXXXXXXXX_!!0-
item_pic.jpg_240x240.jpg" alt="GY-68 BMP180 ?? BOSCH?? ??????? ??
BMP085"></img>
            </a>
        </dt>

I'm trying to match the class="item " bit as a preliminary venture:

from pyvirtualdisplay import Display
from selenium import webdriver
import time

# Run Firefox inside a virtual (non-visible) X display, load the shop's
# category page, and look up the first element carrying the class "item".
display = Display(visible=0, size=(800, 600))
display.start()
try:
    browser = webdriver.Firefox()
    try:
        browser.get('http://xdguo.taobao.com/category-499399872.htm')
        print(browser.title)

        # Fixed pause before querying the DOM; presumably meant to give the
        # page's scripts time to populate the item list.
        time.sleep(120)
        # A class-name locator takes exactly one class token. The trailing
        # space copied from the markup (class="item ") is only a separator
        # in the attribute value, not part of the class name.
        content = browser.find_element_by_class_name('item')
        print(content)
    finally:
        # Close the browser even when the lookup raises
        # NoSuchElementException, so no Firefox process is left behind.
        browser.quit()
finally:
    # Always tear down the virtual display as well.
    display.stop()


I get:
    selenium.common.exceptions.NoSuchElementException: Message: Unable to 
locate element: {"method":"class name","selector":"item "}

I also tried using WebKit - I know the site renders okay in WebKit because I 
tested with rekonq. Here, I get the page (in Chinese) but the actual/relevant 
data is not there. WebKit's supposed to run the JavaScript and give me the 
final results but I don't think that's happening.

import sys
from io import StringIO
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
from lxml import etree

#Take this class for granted.Just use result of rendering.
class Render(QWebPage):
    """Load a URL in a QWebPage and block until the load has finished.

    Constructing an instance starts the Qt event loop and returns once the
    page emits ``loadFinished``. Afterwards:

    * ``frame`` is the page's main frame (``None`` until the signal fires).
    * ``ok`` is the boolean delivered with ``loadFinished``.
    """

    def __init__(self, url):
        # The application object is set up before the page is initialised,
        # as in the original ordering. Reuse an existing one so that Render
        # can be instantiated more than once in the same process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        # Defined up front so callers can tell "not loaded" from "loaded".
        self.frame = None
        self.ok = False
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Runs the event loop; _loadFinished() stops it again.
        self.app.exec_()

    def _loadFinished(self, result):
        """Record the load outcome and main frame, then leave the event loop."""
        self.ok = bool(result)
        self.frame = self.mainFrame()
        self.app.quit()

# Render the category page with WebKit, parse the resulting HTML with lxml
# and print the alt text of each product image.
url = 'http://xdguo.taobao.com/category-499399872.htm'
r = Render(url)  # returns a Render object once its event loop has exited
result = r.frame.toHtml()  # returns a QString
result_utf8 = result.toUtf8()  # returns a QByteArray of utf8 data

#QByteArray->str->unicode
#contents = StringIO(unicode(result_utf8.data(), "utf-8"))
data = result_utf8.data()  # returns byte string
print(data)

element = html.fromstring(data)
print(element.tag)

# The class attribute is matched literally, including its trailing space.
for img in element.xpath('//dl[@class="item "]/dt[@class="photo"]/a/img'):
    print(img.get('alt'))

# Earlier attempt using an absolute path, kept for reference only. Every line
# of it must stay commented: archive_links is not defined otherwise.
#archive_links = html.fromstring(str(result.toAscii()))
#print archive_links.xpath(
#    "/html/body/div[2]/div[3]/div[2]/div[2]/div[1]/div/div"
#    "/div/div/div/div[2]/div[2]/dl[1]/dt/a/img")

Basically I want a list of parts the seller has to offer that I can grep, 
sort, uniq. I also tried elinks and lynx with ECMAScript but that was too 
basic and didn't work.



More information about the Python-list mailing list