Hello All,<br><br>How do I make a class for retrieving all forms and input fields on a web page? By that I mean the form name, the form URL, and all input fields (text, textarea, select, etc.). I have something currently and it kind of works. It also kind of works for retrieving all images on web pages. My main concern here is to make it retrieve forms and input fields into a dict that has this kind of outline (maybe not exactly like this, but something of this nature), so I can use it in other classes.<br>
<br>forms = {"name": {"url": {"input_name1": "", "input_name_2": "", "select_input": {"value1": "", "value2": "selected", "value3": ""}}}}<br clear="all">
<br>This is what I made a long time ago. I haven't touched it in a while, but I need to make it work with some other classes. I need it to retrieve all forms and inputs for me. What is a more efficient way to do this? My background is in PHP and I want to extend my Python knowledge. I came up with this script by Googling something a long time ago and making modifications and additions to it. You can find the stdout_colours class online, JFGI. It allows for display of the caller and called function and colors output to the terminal. Thanks for your help, my friends.<br>
<br>#!/usr/bin/python<br>import urllib, urllib2<br>import re, sys,string<br>import os, sys, Image<br>from sgmllib import SGMLParser<br>from urlparse import urlparse<br>import stdout_colours<br>class URLPacker(SGMLParser):<br>
    # SGMLParser subclass that records anchor hrefs in self.urls, image srcs in self.imgs and form inputs in self.forms.<br>
    # Python 2 only as written (sgmllib, urllib2, urlparse, print statements, "except IOError,e").<br>
<br>    def __init__(self,colorize,caller):<br>        self.colorize=colorize<br>        self.caller=caller<br>        self.reset()<br>    def reset(self):<br>            SGMLParser.reset(self)<br>        self.func_me_color="white_on_black"<br>
        # NOTE(review): the SGMLParser.reset(self) call above is indented deeper than the rest of this method, which Python rejects; dedent it to match.<br>
        <br>        self.soc=stdout_colours.stdout_colors(self.colorize,self.caller)<br>        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>            <br>        self.urls = {}<br>        self.imgs = {}<br>
        self.forms = {}<br>        self.inputs = {}<br>        self.action = ""<br>        self.method = ""<br>        self.url=""<br>        self.path=""<br>        self.source=""<br>
        self.dirname=""<br>        self.level=0<br>        self.max_depth=4<br>        self.urlRExp = re.compile('[.]htm$|[.]html$|[.]shtml$|[/]$|[.]php$', re.IGNORECASE)<br>        self.fileRExp = re.compile('[\/:*?"<>|]')<br>
        self.formats=[]<br>        <br>        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>    def start_a(self,attrs):<br>        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>        self.soc.w(attrs,"red")<br>
        # Record each href once; 0 marks it as not yet processed (process_url sets the entry to 1 after parsing).<br>
        href = [v for k,v in attrs if k=='href']<br>        for value in href:<br>            if self.urls.has_key(value):<br>                pass<br>            else:    <br>                self.urls[value]=0<br>
        <br>        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>            <br>    def start_img(self,attrs):<br>        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>        self.soc.w(attrs,"blue")<br>
        # Record each image src once; 0 marks it as not yet saved (save_images sets the entry to 1).<br>
        src = [v for k,v in attrs if k=="src"]<br>        for value in src:<br>            if self.imgs.has_key(value):<br>                pass<br>            else:<br>                self.imgs[value]=0<br>                <br>
        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>    def start_form(self,attrs):<br>        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>        self.soc.w(attrs,"green")<br>
        # The method falls back to "post" when the tag has no method attribute. NOTE(review): action[0] below raises IndexError for a form tag with no action attribute.<br>
        method = [v for k, v in attrs if k=='method']<br>        action = [v for k,v in attrs if k=="action"]<br>        if string.join(method,"") != "":<br>            self.method=method[0]<br>
        else:<br>            self.method="post"<br>            <br>        self.action=action[0]<br>        <br>        self.soc.w(action,"white_on_green")<br>        self.soc.w(method,"white_on_blue")<br>
        # Reset this action's entry; forms[action][method] is the dict that start_input fills with input name to value pairs.<br>
        self.forms[self.action]={}<br>        <br>        self.forms[self.action][self.method]={}<br>        <br>        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>    def start_input(self,attrs):<br>
        # Store a named input tag under the most recently opened form (self.action / self.method); inputs without a name are ignored.<br>
        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>        self.soc.w(attrs,"yellow")<br>        name = [v for k, v in attrs if k=='name']<br>        value = [v for k,v in attrs if k=="value"]<br>
        if string.join(name,"") !="":<br>            if string.join(value,"")!="":<br>                self.forms[self.action][self.method][name[0]]=value[0]<br>            else:<br>
                self.forms[self.action][self.method][name[0]]=""<br>        self.soc.w(self.forms,"white_on_gold")<br>        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>    def url_dirname(self, url):<br>
        self.soc.me_him(['RETURN:',__name__],self.func_me_color)<br>        #print url<br>        return self.fileRExp.sub('_',url)<br>    def dirname(self,url):<br>        self.soc.me_him(['RETURN:',__name__],self.func_me_color)<br>
        # NOTE(review): reset() assigns self.dirname = "" and process_url assigns it again, so this method is shadowed on instances.<br>
        return os.path.dirname(url)<br>    def save_images(self,minsize):<br>        if os.path.isdir(self.path+self.dirname):<br>            return False # aborting, dir exists<br>        else:<br>            a=self.path+self.dirname<br>
            # NOTE(review): every download call below is commented out, so this only creates the directory, prints, and marks images as done.<br>
            os.mkdir(a)<br>            os.chdir(a)<br>            print self.url<br>            for img in self.imgs.keys():<br>                loc=self.url_dirname(img)<br>                print loc, img, a<br>                try:<br>
                    if self.imgs[img]==0:<br>                        #urllib.urlretrieve(img,loc)<br>                        #os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" '+img)<br>
                        #im = Image.open(loc)<br>                        #if im.size[0]<minsize or im.size[1] < minsize:<br>                            #print "removed",img,loc<br>                            #os.remove(loc)<br>
                        #else:<br>                            #print "image saved", img,loc<br>                        self.imgs[img]=1<br>                except IOError,e:<br>                    self.soc.w(["save_images IOERROR",IOError,e,"img: ",img,"loc: ",loc,"path: ",a],"white_on_red")<br>
            for img in self.urls.keys():<br>                #if img.find(".jpg")!=-1:<br>                loc=self.url_dirname(img)<br>                print loc, img, a<br>                #try:<br>                #    if self.urls[img]==0:<br>
                #        continue<br>                        #os.system('wget -A.jpg,gif,png,wmv,avi,mpg -r -l4 -H -erobots=off --wait=1 -np -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" '+img)<br>
                        #urllib.urlretrieve(img,loc)<br>                        #im = Image.open(loc)<br>                        #if im.size[0]<minsize or im.size[1] < minsize:<br>                            #print "removed",img,loc<br>
                            #os.remove(loc)<br>                        #else:<br>                            #print "image saved", img,loc<br>                    #self.urls[img]=1<br>                #except IOError,e:<br>
                #    self.soc.w(["save_images IOERROR",IOError,e,"img: ",img,"loc: ",loc,"path: ",a],"white_on_red")<br>            <br>            #os.system('wget -r -l1 -A jpg -U \"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.6) Gecko/20050405 Firefox/1.0 (Ubuntu package 1.0.2)\" -nd --wait=2 --random-wait --no-parent -nv ' + self.url)<br>
            #if self.url.find("jpg")==-1:<br>            #    pass<br>            #else:<br>            #urllib.urlretrieve(self.url,string.join(string.split(self.url,"/"),""))<br>                <br>
            os.chdir("..")<br>            #print "done saving to "+path+dirname<br>            #return True<br>    def convert_to_absolute_urls(self):<br>        urls = {}<br>        for url in self.urls:<br>
            # self.url is reduced to its directory part (with a trailing slash) and used as the prefix for links whose scheme contains neither "http" nor "script".<br>
            if os.path.dirname(self.url) =="http:":<br>                self.url=self.url+"/"<br>            else:<br>                self.url=os.path.dirname(self.url)+"/"<br>            print urlparse(url), self.url<br>
            if urlparse(url)[0].find("http") == -1:<br>                if urlparse(url)[0].find("script") == -1:<br>                    urls[self.url+url]=self.urls[url]<br>            else:<br>                    urls[url]=self.urls[url]<br>
        self.urls = urls<br>    def convert_to_absolute_imgs(self):<br>        print "here"<br>        imgs = {}<br>        for img in self.imgs:<br>            if os.path.dirname(self.url) =="http:":<br>
                self.url=self.url+"/"<br>            else:<br>                self.url=os.path.dirname(self.url)+"/"<br>            print urlparse(img), self.url<br>            if urlparse(img)[0].find("http") == -1:<br>
                if urlparse(img)[0].find("script") == -1:<br>                    imgs[self.url+img]=self.imgs[img]<br>            else:<br>                    imgs[img]=self.imgs[img]<br>        print "IMAGES: ",imgs<br>
        self.imgs = imgs<br>    def process_url(self,url,path,level,max_depth):<br>        #self.reset()<br>        self.soc.me_him(['ENTER:',__name__],self.func_me_color)<br>        <br>        if path[-1:] !="/":<br>
            path=path+"/"<br>        <br>        self.url = url<br>        self.path = path<br>        self.soc.w(["url: ",url],"blue")<br>        <br>        self.dirname = self.url_dirname(self.url)<br>
        self.soc.w(["dirname: ",self.dirname],"red")<br>        if os.path.isdir(self.path):<br>            self.soc.w(["path: ",self.path,"LEVEL:",level],"green")<br>            <br>
            if os.path.isdir(self.path+self.dirname):<br>                print "ABORT dir already exists: "+ self.path+self.dirname<br>                return False<br>            else:<br>                try:<br>
                    # URLs without ".jpg" are fetched and parsed (only while still marked 0 in self.urls); URLs containing ".jpg" are passed to urlretrieve instead.<br>
                    <br>                    if self.url.find(".jpg")==-1:<br>                        if self.urls[self.url]==0:<br>                            sock = urllib.urlopen(self.url)<br>                            self.source = sock.read()<br>
                            self.feed(self.source)<br>                            self.urls[self.url]=1<br>                            sock.close()<br>                            self.close()<br>                            <br>
                            self.convert_to_absolute_urls()<br>                            self.convert_to_absolute_imgs()<br>                            <br>                            print "urls: ",self.soc.w(self.urls,"white_on_red")<br>
                            print "forms: ",self.soc.w(self.forms,"white_on_blue")<br>                            print "imgs: ",self.soc.w(self.imgs,"white_on_green")<br>                            <br>
                            print "SAVING IMAGES",self.url<br>                            #self.save_images(250)<br>                            for i in self.urls.keys():<br>                                if self.urls.has_key(i):<br>
                                    pass<br>                                    #self.process_url(i,self.path,level+1,max_depth)<br>                                    <br>                                    <br>                        else:<br>
                            self.soc.w(["BEEN DONE",self.url],"white_on_gold")<br>                    else:<br>                        urllib.urlretrieve(self.url,self.path)<br>                except IOError,e:<br>
                    self.soc.w(["process_url IOERROR",IOError,e],"white_on_red")<br>                    #return False<br>        else:<br>            self.soc.w(["Incorrect Path:", self.path],"white_on_red")<br>
            <br>        self.soc.me_him(['EXIT:',__name__],self.func_me_color)<br>if __name__ == "__main__":<br><br>    path=sys.argv[1]<br>    url=sys.argv[2]<br>    colorize=sys.argv[3]<br>    caller=sys.argv[4]<br>
    <br>    func_me_color="white_on_black"<br>        soc=stdout_colours.stdout_colors(colorize,caller)<br>        soc.me(['ENTER:',__name__],func_me_color)<br><br>    max_depth=4<br>    level=0<br>    up=URLPacker(colorize,caller)<br>
    # NOTE(review): the two soc lines above are indented deeper than the rest of this block, which Python rejects; dedent them to match.<br>
    up.urls[url]=0<br>    up.process_url(url,path,level,max_depth)<br>    <br>    <br>    <br>    <br>    <br>    <br>    soc.me(['EXIT:',__name__],func_me_color)<br>-- <br>А-Б-В-Г-Д-Е-Ё-Ж-З-И-Й-К-Л-М-Н-О-П-Р-С-Т-У-Ф-Х-Ц-Ч-Ш-Щ-Ъ-Ы-Ь-Э-Ю-Я<br>
а-б-в-г-д-е-ё-ж-з-и-й-к-л-м-н-о-п-р-с-т-у-ф-х-ц-ч-ш-щ-ъ-ы-ь-э-ю-я<br>