Which one is the best XML-parser?
Peter Otten
__peter__ at web.de
Fri Jun 24 11:13:53 EDT 2016
Marko Rauhamaa wrote:
> Random832 <random832 at fastmail.com>:
>> You know what would be really nice? A "semi-incremental" parser that
>> can e.g. yield (whether through an event or through the iterator
>> protocol) a fully formed element (preferably one that can be queried
>> with xpath) at a time for each record of a document representing a
>> list of objects. Does anything like that exist?
>
> You can construct that from a SAX parser, but it's less convenient than
> it could be. Python's JSON parser doesn't have it, so I've had to build
> a clumsy one myself:
>
> def decode_json_object_array(self):
>     # A very clumsy implementation of an incremental JSON
>     # decoder
>     it = self.get_text()
>     inbuf = ""
>     while True:
>         try:
>             inbuf += next(it)
>         except StopIteration:
>             # a premature end; trigger a decode error
>             json.loads("[" + inbuf)
>         try:
>             head, tail = inbuf.split("[", 1)
>         except ValueError:
>             continue
>         break
>     # trigger a decode error if head contains junk
>     json.loads(head + "[]")
>     inbuf = ""
>     chunk = tail
>     while True:
>         bracket_maybe = ""
>         for big in chunk.split("]"):
>             comma_maybe = ""
>             for small in big.split(","):
>                 inbuf += comma_maybe + small
>                 comma_maybe = ","
>                 try:
>                     yield json.loads(inbuf)
>                 #except json.JSONDecodeError:
>                 except ValueError: # legacy exception
>                     pass
>                 else:
>                     inbuf = comma_maybe = ""
>             inbuf += bracket_maybe
>             bracket_maybe = "]"
>             try:
>                 yield json.loads(inbuf)
>             #except json.JSONDecodeError:
>             except ValueError: # legacy exception
>                 pass
>             else:
>                 inbuf = ""
>         try:
>             # replace, don't append: the old chunk was already
>             # consumed by the splits above
>             chunk = next(it)
>         except StopIteration:
>             break
>     # trigger a decode error if the leftover buffer contains junk
>     json.loads("[" + inbuf)
>
> It could easily be converted to an analogous XML parser.
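For anyone who wants to try the decoder above: it expects to be a method on
an object whose get_text() returns an iterator of str chunks. A minimal
harness might look like this (names invented here; it assumes the def above
was pasted in at module level, and it only handles flat objects, since the
split()-based scanning doesn't survive "]" or "," inside strings):

import json

class ChunkedSource:
    def __init__(self, chunks):
        self.chunks = chunks

    def get_text(self):
        # the decoder pulls str chunks from this iterator
        return iter(self.chunks)

    # attach the generator above as a method
    decode_json_object_array = decode_json_object_array

src = ChunkedSource([' [ {"a": 1}, {"b"', ': 2} ] '])
for obj in src.decode_json_object_array():
    print(obj)  # {'a': 1}, then {'b': 2}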
For XML you could use iterparse, see
http://effbot.org/elementtree/iterparse.htm
I came up with the following and found memory usage to be stable.
import random
import xml.etree.ElementTree
from xml.sax.saxutils import escape

def iter_elems(file, tag):
    it = xml.etree.ElementTree.iterparse(file, events=("start", "end"))
    root = next(it)[1]  # the first event is ("start", root)
    for event, elem in it:
        if event == "end" and elem.tag == tag:
            yield elem
            root.clear()  # detach seen children so memory stays flat

# --- example below ---

class NeverendingXMLFile:
    def __init__(self, words):
        self.words = words
        self.chunks = self.gen_chunks()

    def gen_chunks(self):
        words = self.words
        yield b"<doc>"
        while True:
            yield "<word>{}</word>".format(random.choice(words)).encode()

    def read(self, size=None):
        return next(self.chunks)

def filelike():
    with open("/usr/share/dict/words") as f:
        words = [escape(line.strip()) for line in f]
    infile = NeverendingXMLFile(words)
    return infile

if __name__ == "__main__":
    for word in iter_elems(filelike(), "word"):
        print(word.text)
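The "construct it from a SAX parser" route that Marko mentions can be pulled
into a generator too, by feeding the parser manually. A rough sketch
(simplified: character data is collected into .text only, tail text between
child elements is ignored):

import xml.sax
import xml.etree.ElementTree as ET

class RecordBuilder(xml.sax.handler.ContentHandler):
    # assembles one complete Element per occurrence of `tag`
    def __init__(self, tag, out):
        super().__init__()
        self.tag = tag
        self.out = out    # completed records are appended here
        self.stack = []   # open elements inside the current record

    def startElement(self, name, attrs):
        if self.stack or name == self.tag:
            elem = ET.Element(name, dict(attrs))
            if self.stack:
                self.stack[-1].append(elem)
            self.stack.append(elem)

    def characters(self, content):
        if self.stack:
            node = self.stack[-1]
            node.text = (node.text or "") + content

    def endElement(self, name):
        if self.stack:
            elem = self.stack.pop()
            if not self.stack:  # back at record level: it's complete
                self.out.append(elem)

def iter_records(file, tag, bufsize=64 * 1024):
    # push chunks into the parser, pull finished records out
    done = []
    parser = xml.sax.make_parser()
    parser.setContentHandler(RecordBuilder(tag, done))
    while True:
        data = file.read(bufsize)
        if not data:
            break
        parser.feed(data)
        while done:
            yield done.pop(0)
    parser.close()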
In theory this should be even simpler with lxml, as its iterparse() exposes
the root element and allows filtering by tag:
http://lxml.de/parsing.html#iterparse-and-iterwalk
Unfortunately root seems to be set only after the closing </...> has been
seen, and thus doesn't help with dereferencing already-seen elements during
iteration.
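A workaround that doesn't need the root element at all is the widely used
cleanup idiom for lxml: clear each element after use and delete the
already-processed siblings through getparent()/getprevious(). Untested
sketch:

import lxml.etree

def iter_elems_lxml(file, tag):
    # iterparse(tag=...) fires "end" events only for matching elements
    for event, elem in lxml.etree.iterparse(file, tag=tag):
        yield elem
        elem.clear()
        # drop the references held by the (already cleared) older siblings
        while elem.getprevious() is not None:
            del elem.getparent()[0]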