[XML-SIG] How to get SAX to parse not well formed HTML doc?

Douglas Bagnall douglas@paradise.net.nz
Wed, 18 Jul 2001 16:52:59 +1200


--Message-Boundary-5927
Content-type: text/plain; charset=US-ASCII
Content-transfer-encoding: 7BIT
Content-description: Mail message body


Hi there,

I've used the attached script to turn html into xml for minidom, and it 
seems to work fairly well so long as the html doesn't contain text cut 
and pasted from Microsoft Word. 

fix(<filename>) prints out xmlish version of the file.
fixstring(<string>) does the same to a string.
obviously, you'd change this somewhere around line 110.
The output is tested against minidom, so if you get no traceback, it 
will be xml safe. Which is not to say it'll look good.


Another thing I've done is put tohtml() and writehtml() methods in my 
version of minidom. They're the same as toxml & writexml, except they 
test empty elements against a tuple: br, img, link and so forth are 
rendered <br /> (note the space) while other empty tags are written the 
long way - <td></td>, <p></p> etc. It's really simple. Would this be of 
any use to anyone else, or would it just clutter up minidom.py?


Douglas



--Message-Boundary-5927
Content-type: text/plain; charset=US-ASCII
Content-transfer-encoding: 7BIT
Content-description: Text from file 'rehtml.py'

#!/usr/bin/python
"""
Excerpt from experimental WCN auto page generating version of Kea editor.

copyright katipo communications ltd  2001
by douglas bagnall <douglas@katipo.co.nz>


fix(<filename>) prints out xmlish version of the file.
fixstring(<string>) does the same to a string.
obviously, you'd change this somewhere around line 110.

The output is tested against minidom, so if you get no traceback,
it will be xml safe. Which is not to say it'll look good.

Html entities are not handled, nor are valueless attributes, like
selected in option (xhtml 1.0 asks for selected="selected").
Misunderstood attributes are omitted without notice.

"""

from xml.dom.minidom import parseString
import sys,re,string,os

# Element names that splitter() always treats as empty ("<tag />") elements,
# whether or not the source wrote them with a trailing slash.
singlelist = (
    'img',
    'br',
    'link',
    'hr',
    'input',
    'area',
    'meta',
)

# Matches, from the start of a token, name="value" or name='value' where the
# value is non-empty and the closing quote is the same as the opening one.
wf = re.compile(r'''\w+=('|")[^'"]+\1''')

def attrify(tag):
    """Normalise the inside of a start tag to ``name attr="value" ...``.

    ``tag`` is the text found between ``<`` and ``>``.  It is split on
    whitespace; the first word is the element name and every following
    word is treated as one attribute.

    * The element name is lower-cased and every character that is not a
      word character or a hyphen is replaced by ``x``.
    * An attribute word that is exactly ``name="value"`` / ``name='value'``
      is kept.  Anything else has its quotes stripped and is re-quoted
      after the first ``=``; if it still is not well formed (for example a
      valueless attribute such as ``selected``) it is dropped silently.
    * Because the split is on whitespace, a quoted value containing spaces
      is cut down to its first word.
    * A bare ``&`` inside a kept attribute is written as ``&amp;`` so the
      result stays well-formed XML; the five predefined XML entities and
      numeric character references are left as they are.

    Returns the normalised string, or ``""`` when ``tag`` holds no words.
    """
    words = tag.split()
    if not words:
        # Nothing to normalise; the original code raised IndexError here.
        return ""

    # Deals rudely with non-alphanumeric tag names.
    fattrs = [re.sub(r"[^\w-]", "x", words[0].lower())]

    for trying in words[1:]:
        m = wf.match(trying)
        # wf.match() only anchors at the start, so also insist that the
        # match consumed the whole word; otherwise name="a"junk would be
        # passed through unchanged and break the XML.
        if not (m and m.end() == len(trying)):
            trying = re.sub(r'[\'"]', "", trying)           # clear quotes
            trying = trying.replace('=', '="', 1) + '"'     # and requote
            m = wf.match(trying)
        if m and m.end() == len(trying):
            # Escape ampersands that do not already start a reference an
            # XML parser understands (common in URL query strings).
            trying = re.sub(
                r"&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)",
                "&amp;",
                trying,
            )
            fattrs.append(trying)
        # otherwise: tried hard enough, so forget this attribute
    return " ".join(fattrs)

def ent(s):
    """Return ``s`` with ``&`` and ``>`` replaced by XML entities.

    ``&`` is handled first so that the ampersand introduced by ``&gt;``
    is not escaped a second time.  ``<`` is left untouched.
    """
    for raw, escaped in (("&", "&amp;"), (">", "&gt;")):
        s = s.replace(raw, escaped)
    return s

def splitter(y):
    """Classify one chunk of a document that was split on ``<``.

    ``y`` is the text that followed a ``<``, up to (not including) the
    next ``<``.  The result is a three item list ``[kind, tag, text]``:

    * ``""``  - not a tag at all; ``tag`` is empty and ``text`` is the
      chunk with the leading ``<`` restored as ``&lt;`` and escaped.
    * ``"s"`` - start tag; ``tag`` is the output of attrify().
    * ``"m"`` - empty element (written with a trailing ``/`` or named in
      ``singlelist``); ``tag`` is the output of attrify().
    * ``"e"`` - end tag; ``tag`` is the lower-cased element name only.
    * ``"c"`` - comment; ``tag`` is the comment including ``!--`` / ``--``.

    ``text`` is the character data following the tag, passed through ent().
    """
    z = y.split('>', 1)
    tag = z[0]
    if len(z) == 1 or not tag or not re.match(r"^[A-Za-z_/!]", tag[0]):
        # No '>' at all, or what follows '<' cannot start a tag: emit the
        # chunk as text.  ent() escapes '&' as well as loose '>'s.
        return ["", "", "&lt;%s" % ent(y)]
    rest = z[1]

    # Comments are checked before anything else so that a comment whose
    # first part happens to end in '/' is not mistaken for an empty element.
    if tag[:3] == "!--":
        if len(tag) >= 5 and tag[-2:] == "--":
            # The whole comment sat before the first '>'; what follows is
            # ordinary text and must be escaped like any other text.
            return ["c", tag, ent(rest)]
        end = rest.find("-->")
        if end == -1:
            # No terminator in this chunk: pass everything through raw.
            return ["c", tag, rest]
        # The comment body itself contained a '>'.  Put the comment back
        # together (without its final '>', which joiner supplies) and
        # escape only the text after it.
        return ["c", "%s>%s" % (tag, rest[:end + 2]), ent(rest[end + 3:])]

    if tag[0] == "/":  # ends
        words = tag[1:].split()
        if not words:
            # "</>" or "</ >": there is no element name, so treat as text.
            return ["", "", "&lt;%s" % ent(y)]
        # Normalise the name exactly as attrify() normalises start tag
        # names, so that </P> pairs up with <p>.  Anything after the name
        # is dropped; end tags carry no attributes.
        return ["e", re.sub(r"[^\w-]", "x", words[0].lower()), ent(rest)]

    if tag[-1] == "/" or tag.split()[0].lower() in singlelist:  # tags without closers
        # Normalise to no trailing "/"; joiner adds " />" back.
        return ["m", attrify(re.sub('/$', '', tag)), ent(rest)]

    return ["s", attrify(tag), ent(rest)]  # start

# joiner() is *almost* the inverse of splitter(): whitespace inside tags
# stays collapsed, and the leading '<' of each tag is put back.
# The keys are the "kind" codes found in item 0 of a splitter() result;
# each template takes (tag, text).
joinerdict = {
    "": "%s%s",
    "s": "<%s>%s",
    "m": "<%s />%s",
    "e": "</%s>%s",
    "c": "<%s>%s",
}


def joiner(z):
    """Render one ``[kind, tag, text]`` triple back into markup."""
    template = joinerdict[z[0]]
    return template % (z[1], z[2])

def fixstring(z):
    """Turn the HTML-ish string ``z`` into balanced, XML-style markup.

    The string is cut at every ``<``; each piece is classified by
    splitter(), start and end tags are balanced with a stack, and the
    pieces are put back together with joiner().  The result is then
    parsed with minidom (wrapped in a ``<span>``) as a well-formedness
    check.

    The outcome is printed, preceded by a comment line saying whether the
    parse worked, and the fixed string is also returned.  If the parse
    fails the exception is re-raised after printing.
    """
    # '<p>foo<br>bar' becomes ['', 'p>foo', 'br>bar'].
    chunks = z.split('<')

    # Whatever precedes the first '<' is always plain text, even if it
    # happens to contain a '>', so it is escaped directly rather than being
    # sent through splitter().
    zbits = [["", "", ent(chunks[0])]]
    zstack = []   # names of the elements currently open, innermost last

    # Elements that are implicitly closed when the same element opens again.
    selfnesting = ("p", "a", "form", "option", "select", "td", "li")  # whatever else too

    for x in map(splitter, chunks[1:]):
        if x[0] == "s":
            stag = x[1].split()[0]
            # zstack must be checked first: it is empty for the first tag.
            if stag in selfnesting and zstack and stag == zstack[-1]:
                zstack.pop()
                zbits.append(["e", stag, ""])
            zstack.append(stag)
            zbits.append(x)
        elif x[0] == "e":
            words = x[1].split()
            etag = ""
            if words:
                # Start tag names on the stack are lower case.
                etag = words[0].lower()
            if etag in zstack:
                # Close everything opened since the matching start tag,
                # innermost first, then the element itself.
                while True:
                    open_tag = zstack.pop()
                    if open_tag == etag:
                        break
                    zbits.append(["e", open_tag, ""])  # ie </open_tag>
                zbits.append(["e", etag, x[2]])
            else:
                # Nothing open by that name (or no name at all): drop the
                # end tag and keep only the text that followed it.
                zbits.append(["", "", x[2]])  # oh well.
        else:    # single or empty tags or comments
            zbits.append(x)    # carry on with no stacking

    zstack.reverse()
    for a in zstack:  # clear any unclosed tags!
        zbits.append(["e", a, ""])

    z = ''.join(map(joiner, zbits))

    # now test it
    try:
        parseString("<span>%s</span>" % z)  # for test if valid tags
    except Exception:
        print("""<!-- Couldn't Parse ! -->\n%s""" % z)
        raise
    # so, it worked!
    print("""<!-- Yay!, successfully parsed -->\n%s""" % z)
    return z


def fix(z):
    """Read the file named ``z`` and run its contents through fixstring().

    If the file cannot be opened or read, a short message is printed and
    the I/O error is re-raised.  Errors raised by fixstring() itself
    propagate unchanged, without the "not a file" message.

    Returns whatever fixstring() returns.
    """
    try:
        f = open(z, "r")
        try:
            data = f.read()
        finally:
            # Always release the handle, even if the read fails.
            f.close()
    except (IOError, OSError):
        print("%s is probably not a file" % z)
        raise
    return fixstring(data)

--Message-Boundary-5927--