#!/usr/bin/env python3
from lxml import etree
# Create a custom element class that knows which attributes of a
# wpt element we care about when deciding whether two elements
# count as duplicates.
#
# Note that both __eq__() and __hash__() need to be defined. I was
# originally expecting that just __hash__() would have been sufficient
# for set() to cull duplicates.
class WPT(etree.ElementBase):
    def __eq__(self, other):
        return (self.attrib['lat'] == other.attrib['lat']
                and self.attrib['lon'] == other.attrib['lon'])
    def __hash__(self):
        return hash((self.attrib['lat'], self.attrib['lon']))
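# Note that lat/lon are compared as raw attribute strings, so values
# that differ only in formatting (e.g. "45.0" vs. "45.00") would still
# be treated as distinct waypoints.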
# Create a parser that returns WPT objects in place of plain _Element
# objects, but only for elements named 'wpt'.
def get_wpt_parser():
    lookup = etree.ElementNamespaceClassLookup()
    parser = etree.XMLParser()
    parser.set_element_class_lookup(lookup)
    namespace = lookup.get_namespace('')
    namespace['wpt'] = WPT
    return parser
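# Illustrative sketch of the input this script expects (the element
# and attribute values here are made up): a root element containing an
# <entries> element whose <wpt lat="..." lon="..."/> children may
# include duplicates, e.g.
#   <entries>
#     <wpt lat="45.0" lon="-93.0"/>
#     <wpt lat="45.0" lon="-93.0"/>
#   </entries>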
# Load the XML data and find the parent of the data we're interested in
wpt_parser = get_wpt_parser()
root = etree.parse('input.xml', wpt_parser)
entries = root.find('entries')
# Some sanity checking: Print out the Python type of the entries
# element (should be a traditional _Element) and each of the children,
# which should be of type WPT.
print(f"type(entries) = {type(entries)}")
print(f"type(entries.children = {','.join(str(type(c)) for c in entries.getchildren())}")
# Read the child elements of the parent into a set, which removes the
# duplicated entries; set() leverages the __eq__ and __hash__ methods
# of the WPT class above.
children = set(entries.iterchildren())
# Replace the original children with the unique children
entries[:] = children
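# Note: sets are unordered, so the surviving <wpt> elements may end up
# in a different order than they appeared in the input file.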
# Write out the resultant XML
with open('output.xml', 'wb') as output_file:
    output_file.write(etree.tostring(root))
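# If an XML declaration or indented output is desired, etree.tostring()
# also accepts xml_declaration=True and pretty_print=True.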