how to make a class to get all forms and input fields from webpage

alex goretoy aleksandr.goretoy at gmail.com
Sat Dec 27 18:54:02 EST 2008


Hello All,

How do I make a class for retrieving all forms and input fields on a web
page? That is: form name, form URL, and all input fields (text, textarea,
select, etc.). I have something currently and it kinda works. It also kinda
works for retrieving all images on web pages. My main concern here is to
make it retrieve forms and input fields into a dict that has this kind of
outline. Maybe not exactly like this, but something of this nature, so I
can use it in other classes.

forms=
{"name":{"url":{"input_name1":"","input_name_2":"","select_input":{"value1":"","value2":"selected","value3":""}}}}

This is what I made a long time ago. I haven't touched it in a while,
but I need to make it work with some other classes. I need it to retrieve
all forms and inputs for me. What is a more efficient way to do this? My
background is in PHP and I want to extend my Python knowledge. I came up
with this script by Googling something a long time ago and making
modifications and additions to it. You can find the stdout_colours class
online, JFGI. It allows for display of the caller and called function and
colors output to the terminal. Thanks for your help, my friends.

#!/usr/bin/python
import urllib, urllib2
import re, sys,string
import os, sys, Image
from sgmllib import SGMLParser
from urlparse import urlparse
import stdout_colours
class URLPacker(SGMLParser):
    """Collect the links, images and forms found on one web page.

    This is Python 2 code: it relies on ``sgmllib``, ``urlparse`` and
    ``urllib.urlopen``.  Feed it a page with process_url(); afterwards the
    results are available as:

        urls  -- {href: flag}; process_url() sets the flag of a fetched page
                 to 1, everything else stays 0
        imgs  -- {src: flag}; save_images() sets the flag to 1
        forms -- {action: {method: {input_name: input_value}}}

    All diagnostics go through a ``stdout_colours.stdout_colors`` object
    (``self.soc``) built from the ``colorize`` and ``caller`` constructor
    arguments; what those two values mean is defined by that module.
    """

    def __init__(self,colorize,caller):
        """Store the logger settings and initialise the parser."""
        # These must exist before reset() runs, because reset() builds the
        # logger from them.
        self.colorize=colorize
        self.caller=caller
        # SGMLParser.__init__ sets up the base-class state and then calls
        # self.reset() for us.
        SGMLParser.__init__(self)

    def reset(self):
        """Clear the parser state and every collected result."""
        SGMLParser.reset(self)
        self.func_me_color="white_on_black"

        self.soc=stdout_colours.stdout_colors(self.colorize,self.caller)
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)

        self.urls = {}
        self.imgs = {}
        self.forms = {}
        self.inputs = {}    # not used inside this class
        self.action = ""    # action of the form currently being parsed
        self.method = ""    # method of the form currently being parsed
        self.url=""
        self.path=""
        self.source=""
        # NOTE: this instance attribute shadows the dirname() method below.
        self.dirname=""
        self.level=0        # not used inside this class
        self.max_depth=4    # not used inside this class
        self.urlRExp = re.compile('[.]htm$|[.]html$|[.]shtml$|[/]$|[.]php$',
                                  re.IGNORECASE)
        # Raw string: same pattern as before, without the invalid "\/"
        # string escape.
        self.fileRExp = re.compile(r'[\/:*?"<>|]')
        self.formats=[]

        self.soc.me_him(['EXIT:',__name__],self.func_me_color)

    def start_a(self,attrs):
        """Record the href of every <a> tag in self.urls (flag 0)."""
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)
        self.soc.w(attrs,"red")
        for key, value in attrs:
            if key == 'href':
                # setdefault keeps the flag of a link that was seen before.
                self.urls.setdefault(value, 0)

        self.soc.me_him(['EXIT:',__name__],self.func_me_color)

    def start_img(self,attrs):
        """Record the src of every <img> tag in self.imgs (flag 0)."""
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)
        self.soc.w(attrs,"blue")
        for key, value in attrs:
            if key == "src":
                self.imgs.setdefault(value, 0)

        self.soc.me_him(['EXIT:',__name__],self.func_me_color)

    def start_form(self,attrs):
        """Open a new entry in self.forms for a <form> tag.

        The entry is keyed by the form's action, then by its method.
        Inputs parsed afterwards are stored under that pair by
        start_input().
        """
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)
        self.soc.w(attrs,"green")
        method = [v for k, v in attrs if k=='method']
        action = [v for k, v in attrs if k=="action"]

        if "".join(method) != "":
            self.method=method[0]
        else:
            # HTML submits a form with GET when no method is given.
            self.method="get"

        # A form without an action attribute is stored under "" instead of
        # raising IndexError.
        if action:
            self.action=action[0]
        else:
            self.action=""

        self.soc.w(action,"white_on_green")
        self.soc.w(method,"white_on_blue")
        # setdefault: a second form with the same action must not wipe out
        # the one already collected.
        self.forms.setdefault(self.action, {}).setdefault(self.method, {})

        self.soc.me_him(['EXIT:',__name__],self.func_me_color)

    def start_input(self,attrs):
        """Store a named <input> under the form currently being parsed.

        Inputs without a name are ignored; a missing value is stored as "".
        """
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)
        self.soc.w(attrs,"yellow")
        name = [v for k, v in attrs if k=='name']
        value = [v for k, v in attrs if k=="value"]
        if "".join(name) != "":
            # setdefault also covers an <input> that appears before/outside
            # any <form>, which used to raise KeyError.
            fields = self.forms.setdefault(self.action, {}).setdefault(
                self.method, {})
            if value:
                fields[name[0]]=value[0]
            else:
                fields[name[0]]=""
        self.soc.w(self.forms,"white_on_gold")
        self.soc.me_him(['EXIT:',__name__],self.func_me_color)

    def url_dirname(self, url):
        """Return url with each of / : * ? " < > | replaced by "_"."""
        self.soc.me_him(['RETURN:',__name__],self.func_me_color)
        return self.fileRExp.sub('_',url)

    def dirname(self,url):
        """Return os.path.dirname(url).

        NOTE: reset() assigns a string to self.dirname, so on an instance
        this method is shadowed and only reachable as
        URLPacker.dirname(obj, url).
        """
        self.soc.me_him(['RETURN:',__name__],self.func_me_color)
        return os.path.dirname(url)

    def save_images(self,minsize):
        """Create the output directory and mark every image as handled.

        The actual download (urlretrieve/wget plus a size filter based on
        minsize) was disabled in the original script, so minsize is
        currently unused and nothing is written into the directory.

        Returns False when the directory already exists, otherwise None.
        """
        target=self.path+self.dirname
        if os.path.isdir(target):
            return False # aborting, dir exists

        os.mkdir(target)
        # Remember where we were: chdir("..") only gets back to self.path,
        # not to the caller's working directory.
        previous_dir=os.getcwd()
        os.chdir(target)
        try:
            print(self.url)
            for img in list(self.imgs.keys()):
                loc=self.url_dirname(img)
                print("%s %s %s" % (loc, img, target))
                if self.imgs[img]==0:
                    # TODO: download img to loc and drop it when either
                    # dimension is smaller than minsize.
                    self.imgs[img]=1
            for link in self.urls:
                loc=self.url_dirname(link)
                print("%s %s %s" % (loc, link, target))
        finally:
            os.chdir(previous_dir)

    def _absolutize(self, table):
        """Return a copy of table with every key resolved against self.url.

        References whose scheme contains "script" (javascript: links) are
        dropped, as before.
        """
        # Local import: the top of the file only imports urlparse() itself.
        from urlparse import urljoin
        resolved = {}
        for ref in table:
            print("%s %s" % (urlparse(ref), self.url))
            if urlparse(ref)[0].find("script") != -1:
                continue
            # urljoin copes with "/abs", "../rel" and already-absolute
            # references, and leaves self.url untouched.
            resolved[urljoin(self.url, ref)]=table[ref]
        return resolved

    def convert_to_absolute_urls(self):
        """Rewrite the keys of self.urls as absolute URLs."""
        self.urls = self._absolutize(self.urls)

    def convert_to_absolute_imgs(self):
        """Rewrite the keys of self.imgs as absolute URLs."""
        print("here")
        imgs = self._absolutize(self.imgs)
        print("IMAGES:  %s" % (imgs,))
        self.imgs = imgs

    def process_url(self,url,path,level,max_depth):
        """Fetch url and collect its links, images and forms.

        path must be an existing directory.  A url containing ".jpg" is
        downloaded into it instead of being parsed.  level is only logged
        and max_depth is unused, because the recursive crawl was disabled
        in the original script.

        Returns False when path + sanitised-url already exists as a
        directory, otherwise None.  IOError from the network or the file
        system is logged, not raised.
        """
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)

        if path[-1:] !="/":
            path=path+"/"

        self.url = url
        self.path = path
        self.soc.w(["url: ",url],"blue")

        self.dirname = self.url_dirname(self.url)
        self.soc.w(["dirname: ",self.dirname],"red")

        if os.path.isdir(self.path):
            self.soc.w(["path: ",self.path,"LEVEL:",level],"green")

            if os.path.isdir(self.path+self.dirname):
                print("ABORT dir already exists: "+ self.path+self.dirname)
                return False

            try:
                if self.url.find(".jpg")!=-1:
                    # urlretrieve needs a file name, not the directory.
                    urllib.urlretrieve(self.url,self.path+self.dirname)
                elif self.urls.get(self.url, 0)==0:
                    # .get(): a url that was never registered in self.urls
                    # counts as "not processed yet" instead of KeyError.
                    sock = urllib.urlopen(self.url)
                    try:
                        self.source = sock.read()
                    finally:
                        sock.close()
                    self.feed(self.source)
                    self.close()
                    self.urls[self.url]=1

                    self.convert_to_absolute_urls()
                    self.convert_to_absolute_imgs()

                    self.soc.w(["urls: ",self.urls],"white_on_red")
                    self.soc.w(["forms: ",self.forms],"white_on_blue")
                    self.soc.w(["imgs: ",self.imgs],"white_on_green")

                    print("SAVING IMAGES %s" % self.url)
                    # TODO: self.save_images(250) and the recursive
                    # process_url(link, self.path, level+1, max_depth)
                    # over self.urls are disabled.
                else:
                    self.soc.w(["BEEN DONE",self.url],"white_on_gold")
            except IOError:
                # sys.exc_info() keeps this valid on every Python 2.x.
                e = sys.exc_info()[1]
                self.soc.w(["process_url IOERROR",IOError,e],"white_on_red")
        else:
            self.soc.w(["Incorrect Path:", self.path],"white_on_red")

        self.soc.me_him(['EXIT:',__name__],self.func_me_color)
if __name__ == "__main__":
    # Command line: PATH URL COLORIZE CALLER
    #   PATH     existing directory handed to URLPacker.process_url()
    #   URL      page to fetch and parse
    #   COLORIZE, CALLER  passed unchanged to stdout_colours.stdout_colors
    if len(sys.argv) < 5:
        sys.exit("usage: %s PATH URL COLORIZE CALLER" % sys.argv[0])

    path=sys.argv[1]
    url=sys.argv[2]
    colorize=sys.argv[3]
    caller=sys.argv[4]

    func_me_color="white_on_black"
    soc=stdout_colours.stdout_colors(colorize,caller)
    soc.me(['ENTER:',__name__],func_me_color)

    max_depth=4
    level=0
    up=URLPacker(colorize,caller)
    # Register the start page as "not processed yet".
    up.urls[url]=0
    up.process_url(url,path,level,max_depth)

    soc.me(['EXIT:',__name__],func_me_color)
-- 
А-Б-В-Г-Д-Е-Ё-Ж-З-И-Й-К-Л-М-Н-О-П-Р-С-Т-У-Ф-Х-Ц-Ч-Ш-Щ-Ъ-Ы-Ь-Э-Ю-Я
а-б-в-г-д-е-ё-ж-з-и-й-к-л-м-н-о-п-р-с-т-у-ф-х-ц-ч-ш-щ-ъ-ы-ь-э-ю-я
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.python.org/pipermail/python-list/attachments/20081227/e4262fab/attachment.html>


More information about the Python-list mailing list