No subject
bruce
bedouglas at earthlink.net
Fri Aug 29 11:16:12 EDT 2008
Hi.
I'm using mechanize to parse a page/site that uses the meta http-equiv tag
in order to perform a refresh/redirect of the page. I've tried a number of
settings, and read different posts on various threads, but seem to be
missing something.
The test.html page below is what the URL returns; however, I was
expecting the test.py app to go ahead and follow the redirect/refresh
automatically.
Does the page (test.html) need to be completely valid HTML?
Any thoughts on what's screwed up here??
thanks
----------------------------------------------------
test.py
--------
import re
import libxml2dom
import urllib
import urllib2
import sys, string
from mechanize import Browser
import mechanize
#import tidy
import os.path
import cookielib
from libxml2dom import Node
from libxml2dom import NodeList
import subprocess
import time
########################
#
# Fetch a schedule.psu.edu course-search page with mechanize and dump
# the raw response body to stdout.
#
# NOTE(review): the indentation of this script was lost when it was
# posted; the nesting below is a reconstruction -- confirm it matches
# the intended control flow (in particular which branch each debug
# print belongs to).
########################
COOKIEFILE = 'cookies.lwp'
urlopen = urllib2.urlopen
cj = cookielib.LWPCookieJar()
Request = urllib2.Request
br = Browser()
br2 = Browser()

# Identity test instead of `!= None`; cj was just constructed above, so
# this branch is always taken.
if cj is not None:
    # Single-argument parenthesised print gives the same output as the
    # Python 2 print statement used originally.
    print("sss")
    # Reload cookies saved by an earlier run, if the file exists.
    if os.path.isfile(COOKIEFILE):
        cj.load(COOKIEFILE)
        print("foo\n")
    # Install the CookieJar for the default CookieProcessor.  The former
    # `if cookielib:` guard was dropped: the module is imported
    # unconditionally, so the test was always true.
    # NOTE(review): the mechanize Browser objects are never handed this
    # jar (the set_cookiejar call was commented out), so presumably they
    # keep their own cookies -- confirm.
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    print("foo2\n")

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values1 = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
headers = {'User-Agent': user_agent}
url = "http://schedule.psu.edu/"

#=======================================
if __name__ == "__main__":
    # main app
    txdata = None

    #----------------------------
    # Ask mechanize to follow HTTP redirects and refreshes, send a
    # Referer header, and skip robots.txt handling.
    # NOTE(review): the sample response carries its META refresh after
    # </head>; if mechanize only inspects <head> for http-equiv tags,
    # set_handle_refresh(True) would never see it -- confirm against
    # the mechanize documentation.
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(True)
    br.addheaders = [('User-Agent', 'Firefox')]

    # The query string was hard-wrapped across several lines in the
    # posted script (a syntax error); it is rejoined here unchanged.
    # An earlier attempt used the same URL without CFID/CFTOKEN.
    url = (
        "http://schedule.psu.edu/act_main_search.cfm"
        "?Semester=FALL%202008%20%20%20%20"
        "&CrseLoc=OZ%3A%3AAbington%20Campus"
        "&CECrseLoc=AllOZ%3A%3AAbington%20Campus"
        "&CourseAbbrev=ACCTG"
        "&CourseNum="
        "&CrseAlpha="
        "&CFID=543143"
        "&CFTOKEN=71842529"
    )
    print("url = %s" % url)

    br.open(url)
    res = br.response()  # this is a copy of response
    s = res.read()
    print("slen= %d" % len(s))
    print(s)
=========================================
test.html
<!-- Response body quoted by the poster as what the search URL returns.
     This excerpt is a fragment: BODY and html are not closed here. -->
<html>
<head>
<TITLE></TITLE>
</head>
<BODY BGCOLOR="#FFFFFF">
<!-- A table cell appears directly inside BODY, with no enclosing TABLE/TR
     in this excerpt. -->
<TD NOWRAP WIDTH="45" VALIGN="top"><A
HREF="javascript:openAWindow('http://www.registrar.psu.edu/faculty_staff/enr
oll_services/clsrooms.html#C','Intent',625,425,1)"><FONT FACE="Arial,
Helvetica, sans-serif" SIZE="2"><strong>Tech Type</strong></FONT></A></TD>
<!-- NOTE(review): this META refresh sits after </head>, inside BODY; a
     client that only scans <head> for http-equiv tags may not act on it.
     Confirm against the HTTP client's documented behaviour. -->
<META HTTP-EQUIV="Refresh" CONTENT="0;url=/soc/fall/Alloz/a-c/acctg.html#">
---------------------------------------------------------
sys.exit()
More information about the Python-list
mailing list