regex for href substitution

Robin Becker robin at jessikat.fsnet.co.uk
Wed Feb 19 10:42:33 EST 2003


..... well, here's my start at a copying parser with href substitution,
hacked out of sgmllib.

import sys
from urlparse import urlparse, urlunparse
from types import StringType
# Root path prefixes for a "review" tree and a "live" tree.  Nothing in the
# visible code refers to these names; presumably callers use them to build
# the copyData tuple given to CopyingSGMLParser -- TODO confirm.
REVIEW='/review'
LIVE='/live'

def _href_substitute(value,(chost,croot,nhost,nroot)):
        T = list(urlparse(value,'http',1))
        if T[0]!='http': return value
        if chost and chost!=T[1]: return value
        if type(croot) is StringType:
                i = T[2].find(croot)
                if i<0 or i>1: return value
                n = len(croot)
                if chost: T[1] = nhost
                else: T[1]=''
                if i==0:
                        T[2] = nroot + T[2][n:]
                else:
                        T[2] = '/' + nroot + T[2][n+1:]
                T[0] = ''
        else:
                if T[2] in croot: T[2] = nroot[croot.index(T[2])]
        T[0]=T[1]=''
        return urlunparse(T)

from sgmllib import SGMLParser
class CopyingSGMLParser(SGMLParser):
    '''
    SGML parser that rebuilds the markup it is fed, rewriting <a href> values.

    Character data is buffered in ``_data``; every markup event first flushes
    that buffer and then appends its own text to ``_result``, which holds the
    copied document once close() has been called.

    Public attributes filled in while parsing:
      links  -- dict whose keys are the (already substituted) href values seen
      names  -- list of <a name=...> values in order of first appearance
      errors -- list of messages, one per duplicated <a name=...> value
    '''

    # No entity definitions of our own: entity references are meant to reach
    # unknown_entityref() and be copied through as written.
    entitydefs = {}

    def __init__(self, verbose=0, copyData=None):
        '''
        verbose switches on verbose mode
        copyData = (current host,current root, new host, new root)

        A single leading '/' is removed from string roots.  When copyData is
        None (the default) hrefs are copied through without substitution.
        '''
        self._data = ''
        self._result = ''
        if copyData is None:
            # list(None) would raise TypeError; treat "no data" as "no rewrite".
            self._copyData = None
        else:
            fields = list(copyData)
            for i in (1, 3):
                # Slicing (rather than indexing [0]) is safe on an empty root
                # and never matches when the root is a list of paths.
                if fields[i][:1] == '/':
                    fields[i] = fields[i][1:]
            self._copyData = tuple(fields)
        self.links = {}
        self.names = []
        self.errors = []
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        '''Buffer character data until the next markup event.'''
        self._data = self._data + data

    def _append(self, data):
        '''Append text to the copied output.'''
        self._result = self._result + data

    def _flush(self):
        '''Move buffered character data into the copied output.'''
        self._append(self._data)
        self._data = ''

    def handle_comment(self, comment):
        '''Copy a comment; ``comment`` is the text between the delimiters.'''
        self._flush()
        self._append('<!--' + comment + '-->')

    def handle_charref(self, ref):
        '''Copy a numeric character reference unchanged.'''
        self._flush()
        self._append('&#' + ref + ';')

    def _substitute(self, value):
        '''Return the href ``value`` after root substitution, if configured.'''
        if self._copyData is None:
            return value
        return _href_substitute(value, self._copyData)

    def unknown_starttag(self, tag, attrs):
        '''
        Copy a start tag with its attributes, each value in double quotes.

        For <a> tags the href value is substituted and recorded in ``links``;
        name values are recorded in ``names`` and duplicates are reported in
        ``errors``.
        '''
        self._flush()

        is_anchor = tag.lower() == 'a'
        pieces = ['<' + tag]
        if attrs:
            for name, value in attrs:
                if is_anchor:
                    attr = name.lower()
                    if attr == 'href':
                        value = self._substitute(value)
                        self.links[value] = None
                    elif attr == 'name':
                        if value in self.names:
                            self.errors.append("duplicate name %s" % value)
                        else:
                            self.names.append(value)
                # A literal '"' inside the value would end the attribute early.
                pieces.append(' ' + name + '="' + value.replace('"', '&quot;') + '"')
        pieces.append('>')
        self._append(''.join(pieces))

    def unknown_endtag(self, tag):
        '''Copy an end tag.'''
        self._flush()
        self._append('</' + tag + '>')

    def unknown_entityref(self, ref):
        '''Copy a named entity reference unchanged.'''
        self._flush()
        self._append('&' + ref + ';')

    def unknown_charref(self, ref):
        '''Copy an unrecognised numeric character reference unchanged.'''
        self._flush()
        self._append('&#' + ref + ';')

    def close(self):
        '''Finish parsing and flush any trailing character data.'''
        SGMLParser.close(self)
        self._flush()

if __name__ == '__main__':
    import sys
    # Demo / smoke test: runs when the script gets no argument, or when the
    # first argument names the class.
    wants_demo = len(sys.argv) == 1 or sys.argv[1] == 'CopyingSGMLParser'
    if wants_demo:
        # Rewrite links under 'review' so that they point under 'live';
        # no host filtering.
        review_to_live = ('', 'review', '', 'live')

        sample_html = '''
<html>
<header>
<title>this is my title</title>

</header>
<body>

&bingo; < etc etc é
<a href=review/dingo>dingo</a>
<a href=/review/dingo>/dingo</a>
<a href=ftp://gargle/review/dingo>ftp</a>
<a href=http://gargle/review/dingo>gargle</a>
<a name=targ>targ</a>
<a href=HTTP:///review/dingo>/dingo</a>
</body>
</html>
'''
        html_copier = CopyingSGMLParser(copyData=review_to_live)
        html_copier.feed(sample_html)
        html_copier.close()
        print(html_copier._result)
        print(html_copier.names)
        print(html_copier.links)

        # Second run: tags the parser has no handler for (DTML) should be
        # copied through.
        dtml_snippet = '''<dtml-var standard_html_header>
Some text
<dtml-var standard_html_footer>'''
        dtml_copier = CopyingSGMLParser(copyData=review_to_live)
        dtml_copier.feed(dtml_snippet)
        dtml_copier.close()
        print(dtml_copier._result)

-- 
Robin Becker




More information about the Python-list mailing list