[XML-SIG] How to get SAX to parse not well formed HTML doc?
Douglas Bagnall
douglas@paradise.net.nz
Wed, 18 Jul 2001 16:52:59 +1200
--Message-Boundary-5927
Content-type: text/plain; charset=US-ASCII
Content-transfer-encoding: 7BIT
Content-description: Mail message body
Hi there,
I've used the attached script to turn html into xml for minidom, and it
seems to work fairly well so long as the html doesn't contain text cut
and pasted from Microsoft Word.
fix(<filename>) prints out xmlish version of the file.
fixstring(<string>) does the same to a string.
obviously, you'd change this somewhere around line 110.
The output is tested against minidom, so if you get no traceback, it
will be xml safe. Which is not to say it'll look good.
Another thing I've done is put tohtml() and writehtml() methods in my
version of minidom. They're the same as toxml & writexml, except they
test empty elements against a tuple: br, img, link and so forth are
rendered <br /> (note the space) while other empty tags are written the
long way - <td></td>, <p></p> etc. It's really simple. Would this be of
any use to anyone else, or would it just clutter up minidom.py?
Douglas
--Message-Boundary-5927
Content-type: text/plain; charset=US-ASCII
Content-transfer-encoding: 7BIT
Content-description: Text from file 'rehtml.py'
#!/usr/bin/python
"""
Excerpt from experimental WCN auto page generating version of Kea editor.
copyright katipo communications ltd 2001
by douglas bagnall <douglas@katipo.co.nz>
fix(<filename>) prints out xmlish version of the file.
fixstring(<string>) does the same to a string.
obviously, you'd change this somewhere around line 110.
The output is tested against minidom, so if you get no traceback,
it will be xml safe. Which is not to say it'll look good.
Html entities are not handled, nor are valueless attributes, like
selected in option (xhtml 1.0 asks for selected="selected").
Misunderstood attributes are omitted without notice.
"""
from xml.dom.minidom import parseString
import sys,re,string,os
# Tags without closers: splitter() emits these in the "<tag />" form.
singlelist = (
    'img',
    'br',
    'link',
    'hr',
    'input',
    'area',
    "meta",
)
# Matches, at the start of a token, name= followed by a non-empty value
# wrapped in a matching pair of single or double quotes.
wf = re.compile(
    r'''\w+=('|")[^'"]+\1'''
)
def _wellformed_attr(token):
    """Return true if the *whole* of token is a name="value" attribute."""
    m = wf.match(token)
    # wf only anchors at the start; insist the match runs to the end so that
    # run-together attributes such as a="b"c="d" are not accepted as they are.
    return m is not None and m.end() == len(token)


def attrify(tag):
    """Return the tag text with a sanitised name and only usable attributes.

    tag is the text found between '<' and '>' (tag name followed by
    whitespace-separated attributes).  The name is lower-cased and every
    character that is not a word character or '-' is replaced by "x".
    Attributes that are not well-formed are stripped of quotes and requoted
    once; anything that still does not look like name="value" is dropped.

    Raises IndexError if tag contains nothing but whitespace.
    """
    tokens = tag.split()
    # deals rudely with non-alphanumeric tag names
    fixed = [re.sub(r"[^\w-]", "x", tokens[0].lower())]
    for token in tokens[1:]:
        if _wellformed_attr(token):
            fixed.append(token)
            continue
        requoted = re.sub(r'[\'"]', "", token)  # clear quotes
        # and requote (won't get valueless html attributes eg <option selected>)
        requoted = requoted.replace('=', '="', 1) + '"'
        if _wellformed_attr(requoted):
            fixed.append(requoted)
        # tried hard enough, so now forget it: keep fixed attrs only
    return " ".join(fixed)
def ent(s):
    """Return s with '&' and '>' replaced by their XML entities.

    '<' is not handled here: fixstring() splits its input on '<' before any
    text reaches this function.  Existing entities are not recognised, so
    an '&amp;' in the input comes out as '&amp;amp;'.
    """
    # '&' first, otherwise the '&' of the freshly made '&gt;' gets escaped too
    s = s.replace("&", "&amp;")
    s = s.replace(">", "&gt;")
    return s
def splitter(y):
    """Classify one '<'-delimited chunk of HTML.

    y is the text that followed a '<' in the input (the '<' itself has been
    removed by the caller).  Returns a three item list [kind, tag, text]:

        ""   not a tag; text is the chunk with its '<' restored as '&lt;'
             and every '>' turned into '&gt;'
        "e"  end tag; tag is the name with runs of whitespace collapsed
        "m"  tag without a closer (ends in '/' or is named in singlelist)
        "c"  comment; tag and text are passed through untouched
        "s"  start tag

    For "e", "m" and "s" the text after the tag is escaped with ent(); for
    "m" and "s" the tag is cleaned up by attrify().
    """
    z = y.split('>', 1)
    tag = z[0]
    if len(z) == 1 or not tag or not re.match(r"^[A-Za-z_/!]", tag[0]):
        # so loose <s and >s get entitied and an empty tag is returned
        return ["", "", "&lt;%s" % y.replace('>', '&gt;')]
    rest = z[1]
    if tag[0] == "/":  # ends
        return ["e", re.sub(r"\s+", " ", tag[1:]), ent(rest)]
    if tag[-1] == "/" or tag.split()[0].lower() in singlelist:  # tags without closers
        # normalise to no trailing "/"; joiner puts " />" back
        return ["m", attrify(re.sub('/$', '', tag)), ent(rest)]
    if tag[:3] == "!--":
        # comment: the text may still be inside the comment, so leave it alone
        return ["c", tag, rest]
    return ["s", attrify(tag), ent(rest)]  # start
# joiner *almost* reverses splitter, but whitespace in tags remains reduced,
# and the leading '<' of each tag is put back.
# Maps the kind code produced by splitter to the format used to rebuild it.
joinerdict = {
    "": "%s%s",        # plain text, tag part is empty
    "s": "<%s>%s",     # start tag
    "m": "<%s />%s",   # tag without a closer
    "e": "</%s>%s",    # end tag
    "c": "<%s>%s",     # comment
}


def joiner(z):
    """Turn a [kind, tag, text] list back into a string.

    Raises KeyError if kind is not a key of joinerdict.
    """
    return joinerdict[z[0]] % (z[1], z[2])
def fixstring(z):
    """Print an XML-ish version of the HTML string z.

    The string is cut at every '<', each chunk is classified by splitter(),
    and a stack of open tags is used to balance the result:

    * a start tag from a short list (p, a, li, td, ...) that directly
      follows an unclosed tag of the same name closes that one first;
    * an end tag closes every tag opened since its matching start tag;
    * an end tag with no matching open tag is dropped (its text is kept);
    * tags still open at the end are closed.

    The result is test-parsed with minidom inside a <span> wrapper and then
    printed after a comment line saying whether the parse worked.  If the
    parse fails the exception is re-raised after printing.
    """
    # tags that may not sit directly inside a tag of the same name
    no_self_nesting = ("p", "a", "form", "option", "select", "td", "li")  # whatever else too

    # list of string bits, without '<' eg '<p>foo<br>bar' becomes ['','p>foo','br>bar']
    pieces = z.split('<')
    # Whatever precedes the first '<' is plain text: it must not be handed to
    # splitter, which would read 'x>y' as a tag named x.
    zbits = [["", "", ent(pieces[0])]]
    zstack = []
    for x in [splitter(p) for p in pieces[1:]]:
        if x[0] == "s":
            stag = x[1].split()[0]
            # zstack may be empty here (first tag in the input)
            if stag in no_self_nesting and zstack and stag == zstack[-1]:
                zstack.pop()
                zbits.append(["e", stag, ""])
            zstack.append(stag)
            zbits.append(x)
        elif x[0] == "e":
            words = x[1].split()
            if words:
                # same name clean-up attrify applies to start tags, so that
                # </P> matches <p>
                etag = re.sub(r"[^\w-]", "x", words[0].lower())
            else:
                etag = ""  # a bare '</>'
            if etag and etag in zstack:
                # close everything opened since the matching start tag
                lasttag = zstack.pop()
                while lasttag != etag:
                    zbits.append(["e", lasttag, ""])  # ie </lasttag>
                    lasttag = zstack.pop()
                zbits.append(["e", etag, x[2]])
            else:
                # couldn't find it in zstack, probably closed already: oh well.
                zbits.append(["", "", x[2]])
        else:  # single or empty tags or comments
            zbits.append(x)  # carry on with no stacking
    zstack.reverse()
    for a in zstack:  # clear any unclosed tags!
        zbits.append(["e", a, ""])
    z = ''.join([joiner(b) for b in zbits])
    # now test it
    try:
        parseString("<span>%s</span>" % z)  # for test if valid tags
    except Exception:
        print("""<!-- Couldn't Parse ! -->\n%s""" % z)
        raise
    # so, it worked!
    print("""<!-- Yay!, successfully parsed -->\n%s""" % z)
def fix(z):
    """Run fixstring() on the contents of the file named z.

    If the file cannot be opened a short message is printed and the
    IOError is re-raised.  Errors raised by fixstring() itself propagate
    unchanged, without the "not a file" message.
    """
    try:
        f = open(z, "r")
    except IOError:
        print("%s is probably not a file" % z)
        raise
    try:
        text = f.read()
    finally:
        # always release the handle, even if the read fails
        f.close()
    fixstring(text)
--Message-Boundary-5927--