sgmllib parser keeps old tag data?
MRAB
google at mrabarnett.plus.com
Fri Feb 13 09:31:40 EST 2009
Berend van Berkum wrote:
> -----BEGIN PGP SIGNED MESSAGE-----
> Hash: SHA1
>
>
> Hi everyone,
>
> I read the source and made numerous tests, but SGMLParser keeps returning *tag* data
> from previous parser instances. I'm totally confused as to why. The content data it
> returns is ok.
>
> E.g.::
>
> sp = MyParser()
> sp.feed('<test><t />Test</test>')
> print sp.content, sp.markup
> sp.close()
>
> sp = MyParser()
> sp.feed('<xml>\n</xml>\r\n')
> print sp.content, sp.markup
> sp.close()
>
> gives::
>
> ('Test', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}])
> ('\n\r\n', [{'t': ({}, (0, 0))}, {'test': ({}, (0, 4))}, {'xml': ({}, (0, 1))}])
>
> It keeps the tags from the previous session, while i'm sure the stack etc.
> should be clean..
>
> Any ideas?
>
>
> regards, Berend
>
> - ----
>
> import sgmllib
>
>
> class MyParser(sgmllib.SGMLParser):
>
> content = ''
> markup = []
> span_stack = []
>
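These are in the _class_ itself, so they will be shared by all its
instances. You should do something like this instead (remembering to
call the base class's __init__ so the parser itself is still set up):

    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.content = ''
        self.markup = []
        self.span_stack = []
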
> def handle_data(self, data):
> self.content += data
>
> def unknown_starttag(self, tag, attr):
> stack = { tag: ( dict(attr), ( len(self.content), ) ) }
> self.span_stack.append(stack)
>
> def unknown_endtag(self, tag):
> prev_tag, ( attr, ( offset, ) ) = self.span_stack.pop().items()[0]
>
> if tag:
> # close all tags on stack until it finds a matching end tag
> # XXX: need to return to LEVEL, not same tag name
> while tag != prev_tag:
> span = { prev_tag: ( attr, ( offset, 0 ) ) }
> self.markup.append( span )
>
> prev_tag, ( attr, ( offset, ) ) = self.span_stack.pop().items()[0]
>
> length = len( self.content ) - offset
> span = { tag: ( attr, ( offset, length ) ) }
> self.markup.append( span )
>
> def do_unknown_tag(self, tag, attr):
> assert not tag and not attr, "do_unknown_tag %s, %s" % (tag, attr)
>
> def close(self):
> sgmllib.SGMLParser.close(self)
> self.content = ''
> self.markup = []
> self.span_stack = []
>
>
> def parse_data(data):
> sp = MyParser()
> sp.feed(data)
> r = sp.content, sp.markup
> sp.close()
> return r
>
> print parse_data('<test><t />Test</test>')
> print parse_data('<xml>\n</xml>\r\n')
> print parse_data('<sgml><s>Test 3</s></sgml>')
>
More information about the Python-list
mailing list