regex for href substitution
Robin Becker
robin at jessikat.fsnet.co.uk
Wed Feb 19 10:42:33 EST 2003
... well, here's my start at a copying parser with href substitution,
hacked out of sgmllib.
import sys
from urlparse import urlparse, urlunparse
from types import StringType
# Path-root constants.  Nothing in the visible code references them;
# presumably they are the intended current/new roots for callers -- TODO confirm.
REVIEW='/review'
LIVE='/live'
def _href_substitute(value,(chost,croot,nhost,nroot)):
T = list(urlparse(value,'http',1))
if T[0]!='http': return value
if chost and chost!=T[1]: return value
if type(croot) is StringType:
i = T[2].find(croot)
if i<0 or i>1: return value
n = len(croot)
if chost: T[1] = nhost
else: T[1]=''
if i==0:
T[2] = nroot + T[2][n:]
else:
T[2] = '/' + nroot + T[2][n+1:]
T[0] = ''
else:
if T[2] in croot: T[2] = nroot[croot.index(T[2])]
T[0]=T[1]=''
return urlunparse(T)
from sgmllib import SGMLParser
class CopyingSGMLParser(SGMLParser):
    '''
    SGML parser that copies its input to self._result, rewriting the
    href of every <a> tag through _href_substitute on the way.

    After feed()/close():
      _result  the re-emitted document text
      links    dict keyed by each (rewritten) <a href> value; values are None
      names    list of the <a name> values seen, in document order
      errors   list of "duplicate name ..." messages
    '''
    # With no entities predefined, &name; references are passed to
    # unknown_entityref and copied through verbatim (see sgmllib docs).
    entitydefs = {}

    def __init__(self, verbose=0, copyData=None):
        '''
        verbose switches on verbose mode
        copyData = (current host,current root, new host, new root)
        A leading '/' is stripped from the two roots.  With copyData
        left as None, hrefs are copied without substitution.
        '''
        self._data = ''
        self._result = ''
        if copyData is None:
            self._copyData = None
        else:
            fields = list(copyData)
            for i in (1, 3):
                # Slicing (not indexing) is safe on an empty root and can
                # never equal '/' when the root is a sequence of paths.
                if fields[i][:1] == '/':
                    fields[i] = fields[i][1:]
            self._copyData = tuple(fields)
        self.links = {}
        self.names = []
        self.errors = []
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        '''Buffer character data until the next markup event.'''
        self._data = self._data + data

    def _append(self, data):
        '''Add text to the output document.'''
        self._result = self._result + data

    def _flush(self):
        '''Move buffered character data into the output.'''
        self._append(self._data)
        self._data = ''

    def handle_comment(self, comment):
        '''Copy a comment through, restoring its delimiters.'''
        self._flush()
        self._append('<!--%s-->' % comment)

    def handle_charref(self, ref):
        '''Copy a numeric character reference through unchanged.'''
        self._flush()
        self._append('&#' + ref + ';')

    def _substitute(self, value):
        '''Return value rewritten according to copyData (if any).'''
        if self._copyData is None:
            return value
        return _href_substitute(value, self._copyData)

    def unknown_starttag(self, tag, attrs):
        '''
        Re-emit a start tag with every attribute value double-quoted.
        For <a> tags, href values are rewritten and recorded in links,
        and name values are recorded in names (duplicates go to errors).
        '''
        self._flush()
        is_anchor = tag.lower() == 'a'
        pieces = ['<' + tag]
        for name, value in attrs:
            if is_anchor:
                attr = name.lower()
                if attr == 'href':
                    value = self._substitute(value)
                    self.links[value] = None
                elif attr == 'name':
                    if value in self.names:
                        self.errors.append("duplicate name %s" % value)
                    else:
                        self.names.append(value)
            pieces.append(' ' + name + '=' + '"' + value + '"')
        pieces.append('>')
        self._append(''.join(pieces))

    def unknown_endtag(self, tag):
        '''Re-emit an end tag.'''
        self._flush()
        self._append('</' + tag + '>')

    def unknown_entityref(self, ref):
        '''Copy an entity reference through unchanged.'''
        self._flush()
        self._append('&' + ref + ';')

    def unknown_charref(self, ref):
        '''Copy an unrecognised character reference through unchanged.'''
        self._flush()
        self._append('&#' + ref + ';')

    def close(self):
        '''Finish parsing and flush any remaining character data.'''
        SGMLParser.close(self)
        self._flush()
if __name__ == '__main__':
    import sys
    argv = sys.argv
    # Run the demo when called with no arguments or with the class name.
    if len(argv) == 1 or argv[1] == 'CopyingSGMLParser':
        mapping = ('', 'review', '', 'live')

        # Demo 1: an HTML page with relative, absolute, ftp and http links.
        # NOTE(review): the sample holds a raw non-ASCII character; Python 2
        # needs a source-encoding declaration for that -- confirm the file has one.
        sample = '''
<html>
<header>
<title>this is my title</title>
</header>
<body>
&bingo; < etc etc é
<a href=review/dingo>dingo</a>
<a href=/review/dingo>/dingo</a>
<a href=ftp://gargle/review/dingo>ftp</a>
<a href=http://gargle/review/dingo>gargle</a>
<a name=targ>targ</a>
<a href=HTTP:///review/dingo>/dingo</a>
</body>
</html>
'''
        parser = CopyingSGMLParser(copyData=mapping)
        parser.feed(sample)
        parser.close()
        print(parser._result)
        print(parser.names)
        print(parser.links)

        # Demo 2: DTML-style tags must pass through the copier as well.
        dtml = '''<dtml-var standard_html_header>
Some text
<dtml-var standard_html_footer>'''
        parser = CopyingSGMLParser(copyData=mapping)
        parser.feed(dtml)
        parser.close()
        print(parser._result)
--
Robin Becker
More information about the Python-list
mailing list