How to read a xml-file sequentially
Robert Roy
rjroy at takingcontrol.com
Tue Oct 30 16:57:58 EST 2001
"Thomas Weholt" <thomas at gatsoft.no>On Tue, 30 Oct 2001 12:17:37 GMT,
>Hi,
>
>I need to read a xml-file sequentially, piece by piece, when a given end-tag
>to a given start-tag is found, the xml-data inbetween the start and end are
>sent to a different method for processing, then the next piece of data is
>read, sort of like old-school record-based files.
>
>Only the needed data is kept in memory.
>
>How can this be done using preferrably expat or sax? No DOM cuz the files
>are huge.
>
>Thomas
>
>
Here is an example of how you could do this. My example document is an
XML document from Project Gutenberg. It contains a bunch of letters
which I wish to extract into individual documents:
http://gutenberg.hwg.org/xmlfiles/lvfnd10v2.xml
You define a class Handler that exposes 3 methods: initialise, write,
and finalise. In this case the document I have contains a bunch of
letters, so each letter is written to its own file.
Then you define a parser class that takes as a parameter an
instantiated Handler. The parser class will then call the appropriate
methods in Handler.
I also defined a FancyHandler that will parse each letter and
output it in HTML format. This is something I just slapped together so
the implementation is a bit fragile, i.e. if there is no <title> tag,
the HTML will be incorrect, but it illustrates the concept.
bob
######### the basic stuff
from xml.parsers import expat
from xml.sax.saxutils import escape
class Handler:
    """Write each captured record to its own numbered file.

    The driving parser calls ``initialise()`` when a record starts,
    ``write(data)`` for every piece of the record, and ``finalise()`` when
    the record ends.  Files are named ``letters/letter<N>`` with N counting
    up from 1.
    """

    def __init__(self):
        self.filenum = 1      # number used for the next output file
        self.outfile = None   # currently open output file, or None

    def initialise(self):
        """Open the next output file, creating the output directory if needed."""
        import os  # local import keeps the class usable on its own

        # Do not leak a file left open by a record that was never finalised.
        if self.outfile is not None:
            self.finalise()
        name = "letters/letter%s" % self.filenum
        outdir = os.path.dirname(name)
        if outdir and not os.path.isdir(outdir):
            os.makedirs(outdir)
        self.outfile = open(name, 'w')
        self.filenum += 1

    def finalise(self):
        """Close the current output file; a no-op when none is open."""
        if self.outfile is None:
            return
        try:
            self.outfile.close()
        finally:
            self.outfile = None

    def write(self, data):
        """Append ``data`` to the current output file."""
        self.outfile.write(data)
class parsit:
    """Stream an XML document and hand each ``interestingtag`` element to a handler.

    Only the element currently being captured is forwarded; everything
    outside such elements is discarded, so memory use does not grow with
    the size of the document.

    ``handler`` must provide ``initialise()``, ``write(data)`` and
    ``finalise()``.  For every outermost ``interestingtag`` element the
    handler receives one ``initialise()`` call, the re-serialised element
    (start tag, content, end tag) through ``write()``, and one
    ``finalise()`` call.
    """

    def __init__(self, handler, interestingtag="record"):
        self.parser = parser = expat.ParserCreate()
        parser.StartElementHandler = self.unknown_starttag
        parser.EndElementHandler = self.unknown_endtag
        parser.CharacterDataHandler = self.handle_data
        self.data = []  # not used by this class; kept for existing callers
        self.interestingtag = interestingtag
        # Nesting depth of open interesting elements; 0 means "not capturing".
        self.capture = 0
        self.handler = handler

    def handle_data(self, data):
        """Forward character data of a captured element to the handler."""
        if self.capture:
            # expat delivers decoded text ('&amp;' arrives as '&'), so it has
            # to be escaped again to keep the forwarded fragment well-formed.
            self.handler.write(escape(data))

    def unknown_starttag(self, tag, attrs):
        """Start capturing on ``interestingtag``; forward start tags while capturing."""
        if tag == self.interestingtag:
            # Only the outermost element opens a new record; a nested one is
            # forwarded as part of the record that contains it.
            if not self.capture:
                self.handler.initialise()
            self.capture += 1
        if self.capture:
            parts = ['<%s' % tag]
            for name, value in attrs.items():
                parts.append(' %s="%s"' % (name, escape(value, {'"': '&quot;'})))
            parts.append('>')
            self.handler.write(''.join(parts))

    def unknown_endtag(self, tag):
        """Forward end tags while capturing; finish the record on the outermost end tag."""
        if not self.capture:
            return
        self.handler.write('</%s>' % tag)
        if tag == self.interestingtag:
            self.capture -= 1
            if not self.capture:
                self.handler.finalise()

    def feed(self, data, isfinal=1):
        """Parse the next chunk; pass ``isfinal=0`` for all but the last chunk."""
        self.parser.Parse(data, isfinal)

    def close(self):
        """Do nothing; present for interface symmetry with other parsers."""
        pass
if __name__=="__main__":
    # Alternative: write HTML instead of raw XML fragments.  FancyHandler
    # must be defined before this point for the line below to work.
    # p = parsit(FancyHandler(), "letter")
    p = parsit(Handler(), "letter")
    # Binary mode hands expat the raw bytes, so the encoding declared in the
    # XML prolog is honoured instead of the platform's default text codec.
    infile = open('lvfnd10v2.xml', 'rb')
    try:
        # feed only what you can chew
        while 1:
            chunk = infile.read(16000)
            if not chunk:
                # An empty final chunk tells expat the document is complete.
                p.feed(chunk)
                break
            p.feed(chunk, 0)
    finally:
        infile.close()
########## this is an alternative handler
class FancyHandler:
    """Convert each captured ``<letter>`` record into a small HTML file.

    Offers the same ``initialise()`` / ``write()`` / ``finalise()``
    interface as the plain handler, but re-parses the XML fragment it is
    given with its own expat parser and maps a few tags to HTML markup.
    Files are named ``htmlletters/letter<N>.html``; every name is recorded
    in ``outfilenames``.

    Known limitation: ``<body>`` is only opened when a ``<title>`` element
    ends, so a letter without a title produces incorrect HTML.
    """

    def __init__(self):
        self.filenum = 1         # number used for the next output file
        self.outfile = None      # currently open output file, or None
        self.outfilenames = []   # names of all files opened so far
        self.start_handlers = {'letter': self.start_letter,
                               'para': self.start_para,
                               'title': self.start_title,
                               'from': self.start_from,
                               'to': self.start_to,
                               'sig': self.start_sig}
        self.end_handlers = {'letter': self.end_letter,
                             'para': self.end_para,
                             'title': self.end_title,
                             'from': self.end_from,
                             'to': self.end_to,
                             'sig': self.end_sig}

    def initialise(self):
        """Create a fresh parser and open the next HTML output file."""
        import os  # local import keeps the class usable on its own

        # Do not leak a file left open by a record that was never finalised.
        if self.outfile is not None:
            self.finalise()
        self.parser = parser = expat.ParserCreate()
        parser.StartElementHandler = self.unknown_starttag
        parser.EndElementHandler = self.unknown_endtag
        parser.CharacterDataHandler = self.handle_data
        fn = "htmlletters/letter%s.html" % self.filenum
        outdir = os.path.dirname(fn)
        if outdir and not os.path.isdir(outdir):
            os.makedirs(outdir)
        self.outfile = open(fn, 'w')
        self.outfilenames.append(fn)
        self.filenum += 1
        self.title = ''    # raw text collected inside <title>
        self.intitle = 0   # true while inside a <title> element

    def finalise(self):
        """Finish parsing the record and close its file; a no-op when none is open."""
        if self.outfile is None:
            return
        try:
            self.parser.Parse('', 1)
        finally:
            # Close the file even when the fragment turns out to be malformed.
            self.outfile.close()
            self.outfile = None

    def write(self, data):
        """Feed the next piece of the record's XML to the internal parser."""
        self.parser.Parse(data, 0)

    def handle_data(self, data):
        """Handle character data: buffer title text, write everything else escaped."""
        if self.intitle:
            # Written once by end_title(), together with the <h1> copy.
            self.title += data
        else:
            self.outfile.write(escape(data))

    def _emit(self, markup):
        """Write literal HTML markup to the output file without escaping."""
        self.outfile.write(markup)

    ## handlers
    def unknown_starttag(self, tag, attrs):
        """Dispatch a start tag to its handler; unknown tags are ignored."""
        hndlr = self.start_handlers.get(tag)
        if hndlr:
            hndlr(attrs)

    def unknown_endtag(self, tag):
        """Dispatch an end tag to its handler; unknown tags are ignored."""
        hndlr = self.end_handlers.get(tag)
        if hndlr:
            hndlr()

    def start_letter(self, attrs):
        self._emit('<html>')

    def end_letter(self):
        self._emit('</body></html>')

    def start_para(self, attrs):
        self._emit('<p>')

    def end_para(self):
        self._emit('</p>')

    def start_title(self, attrs):
        self._emit('<head><title>')
        self.intitle = 1

    def end_title(self):
        # Leave title mode first so the markup below is not appended to
        # self.title.
        self.intitle = 0
        title = escape(self.title)
        self._emit('%s</title></head><body><h1>%s</h1>' % (title, title))

    def start_from(self, attrs):
        self._emit('<em>')

    def end_from(self):
        self._emit('</em>')

    def start_to(self, attrs):
        self._emit('<em>')

    def end_to(self):
        self._emit('</em>')

    def start_sig(self, attrs):
        self._emit('<strong>')

    def end_sig(self):
        self._emit('</strong>')

    def makeIndex(self):
        """Placeholder for building an index of the generated files; not implemented."""
        pass
More information about the Python-list
mailing list