How to read a xml-file sequentially

Robert Roy rjroy at takingcontrol.com
Tue Oct 30 16:57:58 EST 2001


"Thomas Weholt" <thomas at gatsoft.no>On Tue, 30 Oct 2001 12:17:37 GMT, 

>Hi,
>
>I need to read a xml-file sequentially, piece by piece, when a given end-tag
>to a given start-tag is found, the xml-data inbetween the start and end are
>sent to a different method for processing, then the next piece of data is
>read, sort of like old-school record-based files.
>
>Only the needed data is kept in memory.
>
>How can this be done using preferrably expat or sax? No DOM cuz the files
>are huge.
>
>Thomas
>
>
Here is an example of how you could do this. My example document is an
XML document from Project Gutenberg. It contains a bunch of letters
which I wish to extract into individual documents.

http://gutenberg.hwg.org/xmlfiles/lvfnd10v2.xml

You define a class Handler that exposes 3 methods: initialise, write,
and finalise.  In this case the document I have contains a bunch of
letters, and each one is written to its own file.

Then you define a parser class that takes as a parameter an
instantiated Handler. The parser class will then call the appropriate
methods in Handler.

I also defined a FancyHandler that will parse each letter and
output it in HTML format. This is something I just slapped together, so
the implementation is a bit fragile (i.e. if there is no <title> tag,
the HTML will be incorrect), but it illustrates the concept.

bob



#########  the basic stuff
from xml.parsers import expat
from xml.sax.saxutils import escape, quoteattr

class Handler:
    """Write each captured record verbatim to its own numbered file.

    The parser drives this object through three calls per record:
    ``initialise()`` when a record starts, ``write(data)`` for every piece
    of its content, and ``finalise()`` when it ends.
    """

    def __init__(self):
        # No file is open until the first record starts.
        self.outfile = None
        # Number used for the next output file name; starts at 1.
        self.filenum = 1

    def initialise(self):
        """Open the output file for the next record and advance the counter."""
        # NOTE: open() does not create directories, so "letters/" is assumed
        # to exist already.
        target = "letters/letter%s" % (self.filenum,)
        self.outfile = open(target, 'w')
        self.filenum = self.filenum + 1

    def finalise(self):
        """Close the current record's file and forget it."""
        current = self.outfile
        current.close()
        self.outfile = None

    def write(self, data):
        """Append ``data`` to the file of the record in progress."""
        self.outfile.write(data)

class parsit:
    """Stream XML through expat and pass selected elements to a handler.

    Every element named ``interestingtag`` is re-serialised (start tag,
    attributes, text, nested elements, end tag) and sent piece by piece to
    ``handler``, which must provide ``initialise()``, ``write(data)`` and
    ``finalise()``.  Everything outside such elements is discarded, so only
    the record currently being processed needs to be held by the handler.

    Parameters:
        handler: object receiving the ``initialise``/``write``/``finalise``
            calls.
        interestingtag: name of the element that delimits one record.
    """

    def __init__(self, handler, interestingtag="record"):
        self.parser = parser = expat.ParserCreate()
        parser.StartElementHandler = self.unknown_starttag
        parser.EndElementHandler = self.unknown_endtag
        parser.CharacterDataHandler = self.handle_data
        # Not used by this class; kept so the attribute remains available.
        self.data = []
        self.interestingtag = interestingtag
        # Nesting depth of open ``interestingtag`` elements; 0 means we are
        # outside any record and nothing is forwarded.
        self.capture = 0
        self.handler = handler

    def handle_data(self, data):
        """Forward character data that lies inside a record.

        expat delivers text with entities already resolved, so it is
        escaped again to keep the forwarded fragment well-formed XML.
        """
        if self.capture:
            self.handler.write(escape(data))

    def unknown_starttag(self, tag, attrs):
        """Handle any start tag; begin a record on ``interestingtag``."""
        if tag == self.interestingtag:
            # Only the outermost element starts a new record; a nested one
            # with the same name is just part of the record's content.
            if not self.capture:
                self.handler.initialise()
            self.capture += 1
        if self.capture:
            pieces = ['<%s' % tag]
            for name, value in attrs.items():
                # quoteattr() escapes the value and picks a safe quote style.
                pieces.append(' %s=%s' % (name, quoteattr(value)))
            pieces.append('>')
            self.handler.write(''.join(pieces))

    def unknown_endtag(self, tag):
        """Handle any end tag; finish the record when its element closes."""
        if self.capture:
            self.handler.write('</%s>' % tag)
            if tag == self.interestingtag:
                self.capture -= 1
                if not self.capture:
                    self.handler.finalise()

    def feed(self, data, isfinal=1):
        """Parse the next chunk; pass a false ``isfinal`` if more follows."""
        self.parser.Parse(data, isfinal)

    def close(self):
        """Do nothing; present so callers can treat this like other parsers."""
        pass

if __name__ == "__main__":
    # To produce HTML instead, use parsit(FancyHandler(), "letter").  Note
    # that FancyHandler is defined further down in this file, so this block
    # would have to run after that definition.
    p = parsit(Handler(), "letter")
    # Binary mode lets expat honour the encoding named in the XML
    # declaration; the with-block guarantees the file is closed.
    with open('lvfnd10v2.xml', 'rb') as infile:
        # feed only what you can chew
        while True:
            chunk = infile.read(16000)
            if not chunk:
                # An empty final chunk tells expat the document is complete.
                p.feed(chunk, 1)
                break
            p.feed(chunk, 0)



########## this is an alternative handler

class FancyHandler:
    """Handler that converts each captured letter into an HTML file.

    It exposes ``initialise()``, ``write(data)`` and ``finalise()``.  The
    XML fragment passed to ``write`` is parsed by a private expat parser,
    and selected elements are mapped to HTML markup through the
    ``start_handlers`` / ``end_handlers`` dispatch tables.  Elements not
    listed there contribute only their text.
    """

    def __init__(self):
        self.filenum = 1
        self.outfile = None
        # Names of all HTML files created so far, in creation order.
        self.outfilenames = []
        # Title text collected while inside a <title> element.
        self.title = ''
        self.intitle = 0
        self.start_handlers = {
            'letter': self.start_letter,
            'para': self.start_para,
            'title': self.start_title,
            'from': self.start_from,
            'to': self.start_to,
            'sig': self.start_sig,
        }
        self.end_handlers = {
            'letter': self.end_letter,
            'para': self.end_para,
            'title': self.end_title,
            'from': self.end_from,
            'to': self.end_to,
            'sig': self.end_sig,
        }

    def initialise(self):
        """Start a new letter: fresh inner parser and fresh output file."""
        # An expat parser handles exactly one document, so each letter
        # fragment gets its own.
        self.parser = parser = expat.ParserCreate()
        parser.StartElementHandler = self.unknown_starttag
        parser.EndElementHandler = self.unknown_endtag
        parser.CharacterDataHandler = self.handle_data

        # NOTE: open() does not create directories, so "htmlletters/" is
        # assumed to exist already.
        fn = "htmlletters/letter%s.html" % self.filenum
        self.outfilenames.append(fn)
        self.outfile = open(fn, 'w')
        self.filenum += 1
        self.title = ''
        self.intitle = 0

    def finalise(self):
        """Finish parsing the current letter and close its file."""
        try:
            self.parser.Parse('', 1)
        finally:
            # Close the file even if the fragment turned out to be malformed.
            self.outfile.close()
            self.outfile = None

    def write(self, data):
        """Feed the next piece of the letter's XML to the inner parser."""
        self.parser.Parse(data, 0)

    def handle_data(self, data):
        """Receive character data from the inner parser.

        Title text is only collected here; ``end_title`` writes it out, so
        it is not duplicated.  All other text is escaped and written to the
        HTML file.
        """
        if self.intitle:
            self.title += data
        else:
            self.outfile.write(escape(data))

    def _emit(self, markup):
        """Write ready-made HTML markup to the output file unchanged."""
        self.outfile.write(markup)

## handlers
    def unknown_starttag(self, tag, attrs):
        """Dispatch a start tag to its handler, if one is registered."""
        hndlr = self.start_handlers.get(tag, None)
        if hndlr:
            hndlr(attrs)

    def unknown_endtag(self, tag):
        """Dispatch an end tag to its handler, if one is registered."""
        hndlr = self.end_handlers.get(tag, None)
        if hndlr:
            hndlr()

    def start_letter(self, attrs):
        self._emit('<html>')

    def end_letter(self):
        self._emit('</body></html>')

    def start_para(self, attrs):
        self._emit('<p>')

    def end_para(self):
        self._emit('</p>')

    def start_title(self, attrs):
        self._emit('<head><title>')
        self.intitle = 1

    def end_title(self):
        # Leave title mode first so later text goes to the file again.
        self.intitle = 0
        title = escape(self.title)
        # The title closes the <head> and is repeated as the page heading.
        self._emit('%s</title></head><body><h1>%s</h1>' % (title, title))

    def start_from(self, attrs):
        self._emit('<em>')

    def end_from(self):
        self._emit('</em>')

    def start_to(self, attrs):
        self._emit('<em>')

    def end_to(self):
        self._emit('</em>')

    def start_sig(self, attrs):
        self._emit('<strong>')

    def end_sig(self):
        self._emit('</strong>')

    def makeIndex(self):
        """Placeholder; not implemented."""
        pass





More information about the Python-list mailing list