lxml, comparing nodes

code_berzerker emen999 at gmail.com
Fri Jul 25 06:05:00 EDT 2008


> If document order doesn't matter, try sorting the elements of each level in
> the two documents by some arbitrary deterministic key, such as (tag name,
> text, attr count, whatever), and then compare them in order, instead of trying
> to find matches in multiple passes. itertools.groupby() might be your friend here.

I think that sorting multiple times, once per attribute, would cost more
than the approach I've come up with:

from lxml import etree
from collections import deque
import string, re, time

def xmlEqual(xmlStr1, xmlStr2):
  """Report whether two XML documents hold the same nodes, ignoring order.

  Both arguments are parsed with etree.XML(). Every node yielded by
  iter() on the first tree must be paired with a distinct, not yet used
  node of the second tree; what counts as a match is decided by
  findMatchingElem().

  Returns:
    True when every node of the first document found a partner in the
    second one, False when the node counts differ or some node has no
    partner.
  """
  et1 = etree.XML(xmlStr1)
  et2 = etree.XML(xmlStr2)

  elems1 = list(et1.iter())
  # Nodes of the second document that have not been paired yet.
  unmatched = list(et2.iter())

  if len(elems1) != len(unmatched):
    return False

  # Plain iteration instead of repeatedly popping the head of the list:
  # list.pop(0) shifts all remaining items on every call.
  for el in elems1:
    foundEl = findMatchingElem(el, unmatched)
    if foundEl is None:
      return False
    # Drop the partner so it cannot be matched a second time.
    unmatched.remove(foundEl)
  return True


def findMatchingElem(el, eList):
  """Return the first item of eList that elemsEqual() accepts for el.

  Items are tried in list order; None is returned when no item matches.
  """
  matches = (candidate for candidate in eList if elemsEqual(el, candidate))
  return next(matches, None)


def elemsEqual(el1, el2):
  """Decide whether two nodes count as the same for comparison purposes.

  Two nodes are equal when their tag and attrib are equal and the paths
  reported by getroottree().getpath() are identical once every bracketed
  numeric index (such as "[2]") has been removed.

  Returns:
    True if the nodes match on all of the above, otherwise False.
  """
  sameTag = el1.tag == el2.tag
  if not (sameTag and el1.attrib == el2.attrib):
    return False

  # Text and tail are deliberately left out of the comparison for now.
  rawPaths = [node.getroottree().getpath(node) for node in (el1, el2)]

  # Strip positional predicates so that sibling order does not matter.
  stripIndexes = re.compile(r"(\[\d*\])")
  cleanPath1, cleanPath2 = [stripIndexes.sub("", p) for p in rawPaths]
  return cleanPath1 == cleanPath2

Notice that if the documents are in exactly the same order, each element is
compared only once!



More information about the Python-list mailing list