lxml, comparing nodes
code_berzerker
emen999 at gmail.com
Fri Jul 25 06:05:00 EDT 2008
> If document order doesn't matter, try sorting the elements of each level in
> the two documents by some arbitrary deterministic key, such as (tag name,
> text, attr count, whatever), and then compare them in order, instead of trying
> to find matches in multiple passes. itertools.groupby() might be your friend here.
I think that sorting multiple times, once per attribute, would cost more
than the approach I've come up with:
from lxml import etree
from collections import deque
import string, re, time
def xmlEqual(xmlStr1, xmlStr2):
    """Return True if two XML documents match, ignoring document order.

    Both strings are parsed with ``etree.XML``. Every node yielded by
    ``iter()`` on the first root must pair up with a distinct, not yet used
    node of the second document, as decided by ``findMatchingElem``.

    Args:
        xmlStr1: First XML document, in a form accepted by ``etree.XML``.
        xmlStr2: Second XML document, in a form accepted by ``etree.XML``.

    Returns:
        True when both documents yield the same number of nodes and each
        node of the first finds a partner in the second; False otherwise.
    """
    root1 = etree.XML(xmlStr1)
    root2 = etree.XML(xmlStr2)
    nodes1 = list(root1.iter())
    unmatched = list(root2.iter())
    if len(nodes1) != len(unmatched):
        return False
    # Walk the first list in order rather than draining it with pop(0),
    # which shifts the whole list on every call.
    for node in nodes1:
        partner = findMatchingElem(node, unmatched)
        if partner is None:
            return False
        # A node of the second document may be paired only once.
        unmatched.remove(partner)
    return True
def findMatchingElem(el, eList):
    """Return the first entry of ``eList`` that ``elemsEqual`` accepts for ``el``.

    Candidates are tried in list order; ``None`` is returned when none match.
    """
    candidates = (candidate for candidate in eList if elemsEqual(el, candidate))
    return next(candidates, None)
# Bracketed positional predicates in a path, e.g. "[2]". Compiled once at
# import time because elemsEqual runs for every candidate pair of elements.
_PATH_INDEX_RE = re.compile(r"(\[\d*\])")


def elemsEqual(el1, el2):
    """Return True if two elements match on tag, attributes and path.

    The path of each element is taken from ``getroottree().getpath()`` and
    compared after its bracketed numeric indexes are stripped, so sibling
    position does not affect the result. Text and tail are deliberately not
    compared (no requirement for text checking for now).

    Args:
        el1: Element from the first document.
        el2: Element from the second document.

    Returns:
        True when tag, attrib and the index-free paths are all equal;
        False otherwise.
    """
    if el1.tag != el2.tag or el1.attrib != el2.attrib:
        return False
    path1 = _PATH_INDEX_RE.sub("", el1.getroottree().getpath(el1))
    path2 = _PATH_INDEX_RE.sub("", el2.getroottree().getpath(el2))
    return path1 == path2
Notice that if the documents are in exactly the same order, each element is
compared only once!
More information about the Python-list
mailing list