how to make a class to get all forms and input fields from webpage
alex goretoy
aleksandr.goretoy at gmail.com
Sat Dec 27 18:54:02 EST 2008
Hello All,
How do I make a class for retrieving all forms and input fields on a web
page? Meaning: form name, form URL, and all input fields (text, textarea,
select, etc.). I have something currently and it kind of works. It also kind
of works for retrieving all images on web pages. My main concern here is to
make it retrieve forms and input fields into a dict with roughly this outline
(maybe not exactly like this, but something to this nature), so I can use it
in other classes:

forms = {"name": {"url": {"input_name_1": "", "input_name_2": "", "select_input": {"value1": "", "value2": "selected", "value3": ""}}}}
This is what I made a long time ago; I haven't touched it in a while, but I
need to make it work with some other classes. I need it to retrieve all forms
and inputs for me. What is a more efficient way to do this? My background is
in PHP and I want to extend my Python knowledge. I came up with this script
by Googling something a long time ago and making modifications and additions
to it. You can find the stdout_colours class online (JFGI); it allows for
displaying the caller and called function and colors the output to the
terminal. Thanks for your help, my friends.
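One more note: the script below imports stdout_colours. In case you don't want to hunt down the real class, here is a bare stand-in with the same method names my script calls (me, me_him and w). It just prints with no color; everything else about the real class is a guess.

# stdout_colours.py -- bare stand-in for the real colorizer, same interface
class stdout_colors:
    def __init__(self, colorize, caller):
        self.colorize = colorize
        self.caller = caller

    def w(self, obj, color):
        # the real class writes obj to the terminal in the given color
        print obj
        return ''  # so lines like: print "urls: ", soc.w(...) don't end with None

    def me(self, parts, color):
        print ' '.join(str(p) for p in parts)

    # caller/callee trace line; same output as me() in this stand-in
    me_him = me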
#!/usr/bin/python
import urllib, urllib2
import re, sys, string
import os, Image  # Image (PIL) is only needed by the commented-out size checks below
from sgmllib import SGMLParser
from urlparse import urlparse
import stdout_colours
class URLPacker(SGMLParser):
    def __init__(self, colorize, caller):
        self.colorize = colorize
        self.caller = caller
        self.reset()

    def reset(self):
        SGMLParser.reset(self)
        self.func_me_color = "white_on_black"
        self.soc = stdout_colours.stdout_colors(self.colorize, self.caller)
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        self.urls = {}
        self.imgs = {}
        self.forms = {}
        self.inputs = {}
        self.action = ""
        self.method = ""
        self.url = ""
        self.path = ""
        self.source = ""
        self.dirname = ""
        self.level = 0
        self.max_depth = 4
        # pages worth following when crawling
        self.urlRExp = re.compile('[.]htm$|[.]html$|[.]shtml$|[/]$|[.]php$',
                                  re.IGNORECASE)
        # characters that cannot go into a directory name
        self.fileRExp = re.compile('[\/:*?"<>|]')
        self.formats = []
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)
    def start_a(self, attrs):
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        self.soc.w(attrs, "red")
        href = [v for k, v in attrs if k == 'href']
        for value in href:
            if self.urls.has_key(value):
                pass
            else:
                self.urls[value] = 0
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)

    def start_img(self, attrs):
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        self.soc.w(attrs, "blue")
        src = [v for k, v in attrs if k == "src"]
        for value in src:
            if self.imgs.has_key(value):
                pass
            else:
                self.imgs[value] = 0
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)
    def start_form(self, attrs):
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        self.soc.w(attrs, "green")
        method = [v for k, v in attrs if k == 'method']
        action = [v for k, v in attrs if k == "action"]
        if string.join(method, "") != "":
            self.method = method[0]
        else:
            self.method = "post"
        # guard against forms that have no action attribute
        if action:
            self.action = action[0]
        else:
            self.action = ""
        self.soc.w(action, "white_on_green")
        self.soc.w(method, "white_on_blue")
        self.forms[self.action] = {}
        self.forms[self.action][self.method] = {}
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)

    def start_input(self, attrs):
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        self.soc.w(attrs, "yellow")
        name = [v for k, v in attrs if k == 'name']
        value = [v for k, v in attrs if k == "value"]
        if string.join(name, "") != "":
            if string.join(value, "") != "":
                self.forms[self.action][self.method][name[0]] = value[0]
            else:
                self.forms[self.action][self.method][name[0]] = ""
        self.soc.w(self.forms, "white_on_gold")
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)
    def url_dirname(self, url):
        # turn a URL into a name that is safe to use as a directory
        self.soc.me_him(['RETURN:', __name__], self.func_me_color)
        #print url
        return self.fileRExp.sub('_', url)

    def dirname(self, url):
        # note: shadowed by the self.dirname string set in reset(), so this
        # method is never reached through an instance
        self.soc.me_him(['RETURN:', __name__], self.func_me_color)
        return os.path.dirname(url)
    def save_images(self, minsize):
        if os.path.isdir(self.path + self.dirname):
            return False  # aborting, dir exists
        else:
            a = self.path + self.dirname
            os.mkdir(a)
            os.chdir(a)
            print self.url
            for img in self.imgs.keys():
                loc = self.url_dirname(img)
                print loc, img, a
                try:
                    if self.imgs[img] == 0:
                        #urllib.urlretrieve(img, loc)
                        #os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" ' + img)
                        #im = Image.open(loc)
                        #if im.size[0] < minsize or im.size[1] < minsize:
                        #    print "removed", img, loc
                        #    os.remove(loc)
                        #else:
                        #    print "image saved", img, loc
                        self.imgs[img] = 1
                except IOError, e:
                    self.soc.w(["save_images IOERROR", IOError, e, "img: ", img,
                                "loc: ", loc, "path: ", a], "white_on_red")
            for img in self.urls.keys():
                #if img.find(".jpg") != -1:
                loc = self.url_dirname(img)
                print loc, img, a
                #try:
                #    if self.urls[img] == 0:
                #        continue
                #    os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" ' + img)
                #    urllib.urlretrieve(img, loc)
                #    im = Image.open(loc)
                #    if im.size[0] < minsize or im.size[1] < minsize:
                #        print "removed", img, loc
                #        os.remove(loc)
                #    else:
                #        print "image saved", img, loc
                #    self.urls[img] = 1
                #except IOError, e:
                #    self.soc.w(["save_images IOERROR", IOError, e, "img: ", img, "loc: ", loc, "path: ", a], "white_on_red")
            #os.system('wget -r -l1 -A jpg -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" -nd --wait=2 --random-wait --no-parent -nv ' + self.url)
            #if self.url.find("jpg") == -1:
            #    pass
            #else:
            #    urllib.urlretrieve(self.url, string.join(string.split(self.url, "/"), ""))
            os.chdir("..")
            #print "done saving to " + path + dirname
            #return True
    def convert_to_absolute_urls(self):
        urls = {}
        for url in self.urls:
            if os.path.dirname(self.url) == "http:":
                self.url = self.url + "/"
            else:
                self.url = os.path.dirname(self.url) + "/"
            print urlparse(url), self.url
            if urlparse(url)[0].find("http") == -1:
                # relative link: prefix the base URL, but drop javascript: links
                if urlparse(url)[0].find("script") == -1:
                    urls[self.url + url] = self.urls[url]
            else:
                urls[url] = self.urls[url]
        self.urls = urls

    def convert_to_absolute_imgs(self):
        print "here"
        imgs = {}
        for img in self.imgs:
            if os.path.dirname(self.url) == "http:":
                self.url = self.url + "/"
            else:
                self.url = os.path.dirname(self.url) + "/"
            print urlparse(img), self.url
            if urlparse(img)[0].find("http") == -1:
                if urlparse(img)[0].find("script") == -1:
                    imgs[self.url + img] = self.imgs[img]
            else:
                imgs[img] = self.imgs[img]
        print "IMAGES: ", imgs
        self.imgs = imgs
    def process_url(self, url, path, level, max_depth):
        #self.reset()
        self.soc.me_him(['ENTER:', __name__], self.func_me_color)
        if path[-1:] != "/":
            path = path + "/"
        self.url = url
        self.path = path
        self.soc.w(["url: ", url], "blue")
        self.dirname = self.url_dirname(self.url)
        self.soc.w(["dirname: ", self.dirname], "red")
        if os.path.isdir(self.path):
            self.soc.w(["path: ", self.path, "LEVEL:", level], "green")
            if os.path.isdir(self.path + self.dirname):
                print "ABORT dir already exists: " + self.path + self.dirname
                return False
            else:
                try:
                    if self.url.find(".jpg") == -1:
                        if self.urls[self.url] == 0:
                            sock = urllib.urlopen(self.url)
                            self.source = sock.read()
                            self.feed(self.source)
                            self.urls[self.url] = 1
                            sock.close()
                            self.close()
                            self.convert_to_absolute_urls()
                            self.convert_to_absolute_imgs()
                            print "urls: ", self.soc.w(self.urls, "white_on_red")
                            print "forms: ", self.soc.w(self.forms, "white_on_blue")
                            print "imgs: ", self.soc.w(self.imgs, "white_on_green")
                            print "SAVING IMAGES", self.url
                            #self.save_images(250)
                            for i in self.urls.keys():
                                if self.urls.has_key(i):
                                    pass
                                    #self.process_url(i, self.path, level + 1, max_depth)
                        else:
                            self.soc.w(["BEEN DONE", self.url], "white_on_gold")
                    else:
                        urllib.urlretrieve(self.url, self.path)
                except IOError, e:
                    self.soc.w(["process_url IOERROR", IOError, e], "white_on_red")
                    #return False
        else:
            self.soc.w(["Incorrect Path:", self.path], "white_on_red")
        self.soc.me_him(['EXIT:', __name__], self.func_me_color)
if __name__ == "__main__":
    # usage: python <this script> <save path> <url> <colorize> <caller>
    path = sys.argv[1]
    url = sys.argv[2]
    colorize = sys.argv[3]
    caller = sys.argv[4]
    func_me_color = "white_on_black"
    soc = stdout_colours.stdout_colors(colorize, caller)
    soc.me(['ENTER:', __name__], func_me_color)
    max_depth = 4
    level = 0
    up = URLPacker(colorize, caller)
    up.urls[url] = 0  # seed the crawl with the starting URL
    up.process_url(url, path, level, max_depth)
    soc.me(['EXIT:', __name__], func_me_color)
--
А-Б-В-Г-Д-Е-Ё-Ж-З-И-Й-К-Л-М-Н-О-П-Р-С-Т-У-Ф-Х-Ц-Ч-Ш-Щ-Ъ-Ы-Ь-Э-Ю-Я
а-б-в-г-д-е-ё-ж-з-и-й-к-л-м-н-о-п-р-с-т-у-ф-х-ц-ч-ш-щ-ъ-ы-ь-э-ю-я