Which one is the best XML-parser?
Peter Otten
__peter__ at web.de
Fri Jun 24 11:13:53 EDT 2016
Marko Rauhamaa wrote:
> Random832 <random832 at fastmail.com>:
>> You know what would be really nice? A "semi-incremental" parser that
>> can e.g. yield (whether through an event or through the iterator
>> protocol) a fully formed element (preferably one that can be queried
>> with xpath) at a time for each record of a document representing a
>> list of objects. Does anything like that exist?
>
> You can construct that from a SAX parser, but it's less convenient than
> it could be. Python's JSON parser doesn't have it, so I've had to build
> a clumsy one myself:
>
> def decode_json_object_array(self):
>     # A very clumsy implementation of an incremental JSON
>     # decoder
>     it = self.get_text()
>     inbuf = ""
>     while True:
>         try:
>             inbuf += next(it)
>         except StopIteration:
>             # a premature end; trigger a decode error
>             json.loads("[" + inbuf)
>         try:
>             head, tail = inbuf.split("[", 1)
>         except ValueError:
>             continue
>         break
>     # trigger a decode error if head contains junk
>     json.loads(head + "[]")
>     inbuf = ""
>     chunk = tail
>     while True:
>         bracket_maybe = ""
>         for big in chunk.split("]"):
>             comma_maybe = ""
>             for small in big.split(","):
>                 inbuf += comma_maybe + small
>                 comma_maybe = ","
>                 try:
>                     yield json.loads(inbuf)
>                 #except json.JSONDecodeError:
>                 except ValueError: # legacy exception
>                     pass
>                 else:
>                     inbuf = comma_maybe = ""
>             inbuf += bracket_maybe
>             bracket_maybe = "]"
>             try:
>                 yield json.loads(inbuf)
>             #except json.JSONDecodeError:
>             except ValueError: # legacy exception
>                 pass
>             else:
>                 inbuf = ""
>         try:
>             # replace, don't append: the old chunk was already
>             # consumed by the splits above
>             chunk = next(it)
>         except StopIteration:
>             break
>     # trigger a decode error if the leftover buffer contains junk
>     json.loads("[" + inbuf)
>
> It could easily be converted to an analogous XML parser.
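For anyone who wants to try the decoder above: it expects to be a method on
an object whose get_text() returns an iterator of str chunks. A minimal
harness might look like this (names invented here; it assumes the def above
was pasted in at module level, and it only handles flat objects, since the
split()-based scanning doesn't survive "]" or "," inside strings):

import json

class ChunkedSource:
    def __init__(self, chunks):
        self.chunks = chunks

    def get_text(self):
        # the decoder pulls str chunks from this iterator
        return iter(self.chunks)

    # attach the generator above as a method
    decode_json_object_array = decode_json_object_array

src = ChunkedSource([' [ {"a": 1}, {"b"', ': 2} ] '])
for obj in src.decode_json_object_array():
    print(obj)  # {'a': 1}, then {'b': 2}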
For XML you could use iterparse, see
http://effbot.org/elementtree/iterparse.htm
I came up with the following and found memory usage to be stable.
import random
import xml.etree.ElementTree
from xml.sax.saxutils import escape

def iter_elems(file, tag):
    it = xml.etree.ElementTree.iterparse(file, events=("start", "end"))
    root = next(it)[1]  # the first event is ("start", root)
    for event, elem in it:
        if event == "end" and elem.tag == tag:
            yield elem
            root.clear()  # detach seen children so memory stays flat

# --- example below ---

class NeverendingXMLFile:
    def __init__(self, words):
        self.words = words
        self.chunks = self.gen_chunks()

    def gen_chunks(self):
        words = self.words
        yield b"<doc>"
        while True:
            yield "<word>{}</word>".format(random.choice(words)).encode()

    def read(self, size=None):
        return next(self.chunks)

def filelike():
    with open("/usr/share/dict/words") as f:
        words = [escape(line.strip()) for line in f]
    infile = NeverendingXMLFile(words)
    return infile

if __name__ == "__main__":
    for word in iter_elems(filelike(), "word"):
        print(word.text)
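The "construct it from a SAX parser" route that Marko mentions can be pulled
into a generator too, by feeding the parser manually. A rough sketch
(simplified: character data is collected into .text only, tail text between
child elements is ignored):

import xml.sax
import xml.etree.ElementTree as ET

class RecordBuilder(xml.sax.handler.ContentHandler):
    # assembles one complete Element per occurrence of `tag`
    def __init__(self, tag, out):
        super().__init__()
        self.tag = tag
        self.out = out    # completed records are appended here
        self.stack = []   # open elements inside the current record

    def startElement(self, name, attrs):
        if self.stack or name == self.tag:
            elem = ET.Element(name, dict(attrs))
            if self.stack:
                self.stack[-1].append(elem)
            self.stack.append(elem)

    def characters(self, content):
        if self.stack:
            node = self.stack[-1]
            node.text = (node.text or "") + content

    def endElement(self, name):
        if self.stack:
            elem = self.stack.pop()
            if not self.stack:  # back at record level: it's complete
                self.out.append(elem)

def iter_records(file, tag, bufsize=64 * 1024):
    # push chunks into the parser, pull finished records out
    done = []
    parser = xml.sax.make_parser()
    parser.setContentHandler(RecordBuilder(tag, done))
    while True:
        data = file.read(bufsize)
        if not data:
            break
        parser.feed(data)
        while done:
            yield done.pop(0)
    parser.close()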
In theory this should be even simpler with lxml, as its iterparse() exposes
the root element and allows filtering by tag:
http://lxml.de/parsing.html#iterparse-and-iterwalk
Unfortunately root seems to be set only after the closing </...> has been
seen, and thus doesn't help with dereferencing already-seen elements during
iteration.
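A workaround that doesn't need the root element at all is the widely used
cleanup idiom for lxml: clear each element after use and delete the
already-processed siblings through getparent()/getprevious(). Untested
sketch:

import lxml.etree

def iter_elems_lxml(file, tag):
    # iterparse(tag=...) fires "end" events only for matching elements
    for event, elem in lxml.etree.iterparse(file, tag=tag):
        yield elem
        elem.clear()
        # drop the references held by the (already cleared) older siblings
        while elem.getprevious() is not None:
            del elem.getparent()[0]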