Python - working with xml/lxml/objectify/schemas, datatypes, and assignments
aapost
aapost at idontexist.club
Sun Jan 15 20:13:35 EST 2023
On 1/3/23 22:57, aapost wrote:
> I am trying to wrap my head around how one goes about working with and
> editing xml elements ... Back to
> contemplating and tinkering..
For anyone in a similar situation, xmlschema is actually quite nice.
It didn't have the features I was looking for out of the box, but it
does have a to_objects function and I have learned quite a bit while
picking it apart. I am able to patch it to be good enough for my
requirements.
Below is the patch for anyone interested:
#
# Contribution for the xmlschema & elementpath python modules which are
# Copyright (c), 2016-2020, SISSA (International School for Advanced
# Studies).
# All rights reserved.
#
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#
# Patching and expansion of the xmlschema.dataobjects.DataElement object
# features.
# To get the best demonstration, change the schema variable to your .xsd
# file and xmlobj to your .xml file,
# then run this as $ python -i filename.py
from typing import Any, Optional, Union, Tuple
#from types import MethodType
class ValueLockedError(Exception):
    """Raised when a change is attempted on an object whose ``_locked`` is True.

    The formatted text is stored on ``.message`` and is also handed to
    ``Exception`` so that ``str(err)`` shows it.
    """

    def __init__(self, obj, variable_name):
        # One entry per output line; joined with newlines below.
        report_lines = (
            "Can't set ." + variable_name,
            "The object:",
            str(obj),
            "is Locked (._locked is set to True)",
        )
        self.message = "\n".join(report_lines)
        super().__init__(self.message)
# importing in order necessary for intended monkey patch
import elementpath.etree as ep_etree
# Monkey patching additional functions onto the imported elementpath.etree
# module, for namespace management of xml.etree.ElementTree code paths
# (which use the global variable register_namespace._namespace_map for
# namespace registering).
def etree_remove_registered_namespace(elem: ep_etree.ElementProtocol,
                                      uri: str = '') -> None:
    """Remove the registration of namespace *uri* for the etree flavour of *elem*.

    For elements without an ``nsmap`` attribute the entry keyed by *uri* is
    dropped from the etree module's global
    ``register_namespace._namespace_map`` (uri -> prefix). For elements that
    expose ``nsmap``, every prefix mapped to *uri* is removed from that map.
    An unknown *uri* is silently ignored.

    :param elem: an ElementTree-style element; only used to pick the etree
        implementation.
    :param uri: the namespace URI to unregister.
    :raises TypeError: if *elem* is not an etree element.
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if not hasattr(elem, 'nsmap'):
        # _namespace_map is uri -> prefix, so the URI is the key to drop.
        etree_module.register_namespace._namespace_map.pop(uri, None)
        return

    # Not expected to be reached (lxml is not used by the author).
    # NOTE(review): lxml documents Element.nsmap as prefix -> URI and may
    # hand back a copy, in which case this has no lasting effect - confirm
    # before relying on it.
    nsmap = elem.nsmap
    # Collect the matching prefixes first: deleting from a dict while
    # iterating over it raises RuntimeError.
    # research - can there be multiple prefixes for one uri, or is it 1:1?
    stale_prefixes = [prefix for prefix, ns in nsmap.items() if ns == uri]
    for prefix in stale_prefixes:
        nsmap.pop(prefix, None)


# Patching: modules do not apply the descriptor protocol, so the plain
# function is stored (a staticmethod object fetched from a module is only
# callable on Python 3.10+).
setattr(ep_etree, "etree_remove_registered_namespace",
        etree_remove_registered_namespace)
# For namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering).
def etree_get_registered_namespaces(elem: ep_etree.ElementProtocol) -> dict:
    """Return the namespace map that applies to the etree flavour of *elem*.

    For elements without an ``nsmap`` attribute this is the etree module's
    global ``register_namespace._namespace_map`` itself (not a copy), so
    changes made to the returned dict affect the registry. Otherwise
    ``elem.nsmap`` is returned.

    :param elem: an ElementTree-style element; only used to pick the etree
        implementation.
    :raises TypeError: if *elem* is not an etree element.
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if not hasattr(elem, 'nsmap'):
        return etree_module.register_namespace._namespace_map
    return elem.nsmap  # shouldn't be met


# Patching: modules do not apply the descriptor protocol, so the plain
# function is stored (a staticmethod object fetched from a module is only
# callable on Python 3.10+).
setattr(ep_etree, "etree_get_registered_namespaces",
        etree_get_registered_namespaces)
# For namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering).
def etree_register_namespace(elem: ep_etree.ElementProtocol,
                             prefix: Optional[str] = None,
                             uri: Optional[str] = None) -> None:
    """Register *prefix* for namespace *uri* with the etree flavour of *elem*.

    Nothing is registered unless both *prefix* and *uri* are given. For
    elements without an ``nsmap`` attribute the etree module's
    ``register_namespace(prefix, uri)`` is called; otherwise the pair is
    written to ``elem.nsmap``.

    :param elem: an ElementTree-style element; only used to pick the etree
        implementation.
    :param prefix: namespace prefix to register.
    :param uri: namespace URI to register.
    :raises TypeError: if *elem* is not an etree element (checked even when
        there is nothing to register).
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if prefix is None or uri is None:
        return

    if not hasattr(elem, 'nsmap'):
        etree_module.register_namespace(prefix, uri)
    else:
        # TODO research this for better understanding
        # _namespace_map is uri->prefix
        # DataElement.nsmap prefix->uri
        # NOTE(review): lxml documents Element.nsmap as prefix->URI and may
        # hand back a copy, in which case this assignment has no lasting
        # effect - confirm. Not using lxml anyway, so this branch shouldn't
        # be reached.
        elem.nsmap[prefix] = uri


# Patching: modules do not apply the descriptor protocol, so the plain
# function is stored (a staticmethod object fetched from a module is only
# callable on Python 3.10+).
setattr(ep_etree, "etree_register_namespace", etree_register_namespace)
# importing in order necessary for intended monkey patch
import xmlschema
# Monkey patching additional instance functions onto the imported xmlschema
# module, specifically xmlschema.dataobjects.DataElement.
# Instance functions so a DataElement object can use the elementpath.etree
# namespace functions patched in earlier in this file.
def register_namespace(self,
                       prefix: Optional[str] = None,
                       uri: Optional[str] = None) -> None:
    """Register *prefix* -> *uri* for the etree flavour this element encodes to.

    The element is encoded with ``validation='lax'`` only to obtain an etree
    element to hand to ``ep_etree.etree_register_namespace``; the collected
    encoding errors are ignored. Nothing is registered unless both *prefix*
    and *uri* are given (the element is still encoded).

    :param prefix: namespace prefix to register.
    :param uri: namespace URI to register.
    """
    root, _errors = self.encode(validation='lax')
    if prefix is not None and uri is not None:
        ep_etree.etree_register_namespace(root, prefix, uri)


# Patching: attach as an instance method of DataElement.
setattr(xmlschema.dataobjects.DataElement, "register_namespace",
        register_namespace)
def remove_registered_namespace(self, uri: str = '') -> None:
    """Unregister namespace *uri* for the etree flavour this element encodes to.

    The element is encoded with ``validation='lax'``; only the resulting
    root element is used, the collected errors are discarded.
    """
    # root = self.encode(validation='strict')
    encoded_root, _ = self.encode(validation='lax')
    ep_etree.etree_remove_registered_namespace(encoded_root, uri)


# Patching: attach as an instance method of DataElement.
setattr(xmlschema.dataobjects.DataElement,
        "remove_registered_namespace", remove_registered_namespace)
def get_registered_namespaces(self) -> dict:
    """Return the registered namespaces for the etree flavour this element encodes to.

    The element is encoded with ``validation='lax'``; only the resulting
    root element is used, the collected errors are discarded.
    """
    # root = self.encode(validation='strict')
    encoded_root, _ = self.encode(validation='lax')
    registered = ep_etree.etree_get_registered_namespaces(encoded_root)
    return registered


# Patching: attach as an instance method of DataElement.
setattr(xmlschema.dataobjects.DataElement, "get_registered_namespaces",
        get_registered_namespaces)
# Replacing .validate() & .is_valid() on DataElement so that namespaces from
# the DataElement get set to the xml.etree.ElementTree
# register_namespace._namespace_map global when used.
def validate(self, use_defaults: bool = True,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             max_depth: Optional[int] = None) -> None:
    """
    Validates the XML data object.

    When *namespaces* is not given and the element has a non-empty ``nsmap``,
    that map is used instead.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid.
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema
        bindings.
    """
    if self.nsmap and namespaces is None:  # added code
        namespaces = self.nsmap  # added code
    # Only the first reported error is raised.
    for error in self.iter_errors(use_defaults, namespaces, max_depth):
        raise error


# Patching: replace DataElement.validate.
setattr(xmlschema.dataobjects.DataElement, "validate", validate)
def is_valid(self, use_defaults: bool = True,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             max_depth: Optional[int] = None) -> bool:
    """
    Like :meth:`validate` except it does not raise an exception on validation
    error but returns ``True`` if the XML data object is valid, ``False`` if
    it's invalid.

    When *namespaces* is not given and the element has a non-empty ``nsmap``,
    that map is used instead.

    :raises: :exc:`XMLSchemaValueError` if the instance has no schema
        bindings.
    """
    if self.nsmap and namespaces is None:  # added code
        namespaces = self.nsmap  # added code
    # Stop at the first reported error; None means nothing was reported.
    error = next(self.iter_errors(use_defaults, namespaces, max_depth), None)
    return error is None


# Patching: replace DataElement.is_valid.
setattr(xmlschema.dataobjects.DataElement, "is_valid", is_valid)
# Replace .tostring() on DataElement to allow for xml_declaration/encoding
# support.
# TODO research more, will likely customize a bit further
def tostring(self,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             indent: str = '',
             max_lines: Optional[int] = None,
             spaces_for_tab: Optional[int] = None,
             xml_declaration: Optional[bool] = None,
             encoding: str = 'unicode',
             method: str = 'xml') -> Any:
    """Serialize the data element tree to an XML source string.

    The element is encoded with ``validation="strict"`` and the result is
    passed, together with all arguments in declaration order, to
    ``ep_etree.etree_tostring``. When *namespaces* is not given and the
    element has a non-empty ``nsmap``, that map is used instead.

    :return: whatever ``ep_etree.etree_tostring`` returns for the given
        *encoding* / *method*.
    """
    if self.nsmap and namespaces is None:
        namespaces = self.nsmap
    # root, errors = self.encode(validation='lax')
    # Prefer strict on my output, just in case.
    root = self.encode(validation="strict")
    return ep_etree.etree_tostring(
        root, namespaces, indent, max_lines, spaces_for_tab,
        xml_declaration, encoding, method)


# Patching: replace DataElement.tostring.
setattr(xmlschema.dataobjects.DataElement, "tostring", tostring)
# add get_value function - paired with set_value
def get_value(self) -> Any:
    """Return the element's ``value`` attribute unchanged.

    The leftover debugging ``print(type(self))`` was removed so that reading
    a value no longer writes to stdout.
    """
    return self.value
# Patching: attach get_value to DataElement so it can be called as an
# instance method.
setattr(xmlschema.dataobjects.DataElement, "get_value", get_value)
# add set_value function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def set_value(self,
              value: Any,
              use_defaults: bool = True,
              namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
              max_depth: Optional[int] = None) -> None:
    """Assign *value* to ``self.value`` and keep it only if validation passes.

    The previous value is restored before any exception leaves this
    function, and the optional ``_locked`` flag is always released, also
    when ``iter_errors`` raises instead of yielding an error.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not
        valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema
        bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='value')
    if uses_lock:
        self._locked = True

    # Kept in a local so no scratch attribute is left on the element.
    previous_value = self.value
    try:
        self.value = value
        if self.nsmap and namespaces is None:
            namespaces = self.nsmap
        # Only the first reported error is raised.
        for error in self.iter_errors(use_defaults, namespaces, max_depth):
            raise error
    except Exception:
        self.value = previous_value  # revert value back to original
        raise
    finally:
        if uses_lock:
            self._locked = False  # unlock on every exit path


# Patching: attach set_value to DataElement as an instance method.
setattr(xmlschema.dataobjects.DataElement, "set_value", set_value)
# add get_attrib function - paired with set_attrib
# remove added logic from .get(), requiring explicit matches only
def get_attrib(self, key: str) -> Any:
    """Look up *key* directly in ``self.attrib``.

    :raises: :exc:`KeyError` if *key* is not an exact key of ``attrib``.
    """
    attributes = self.attrib
    return attributes[key]
# Patching: attach get_attrib to DataElement so it can be called as an
# instance method.
setattr(xmlschema.dataobjects.DataElement, "get_attrib", get_attrib)
# add set_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def set_attrib(self,
               key: str,
               value: Any,
               use_defaults: bool = True,
               namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
               max_depth: Optional[int] = None) -> None:
    """Set ``self.attrib[key]`` to *value* and keep it only if validation passes.

    On failure the attribute is restored to its previous value, or removed
    again if it did not exist before. The optional ``_locked`` flag is
    always released, also when ``iter_errors`` raises instead of yielding.
    On success, if ``_expand_xDE_attrib_prefix`` is present (set by
    ``expand_xmlschema_DataElement``), the mirrored instance attribute
    ``<prefix><key>`` is updated as well.

    Always returns ``None``.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not
        valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema
        bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='attrib[' + key + ']')
    if uses_lock:
        self._locked = True

    try:
        # Remember the prior state in locals rather than scratch attributes
        # on the element, so nothing stale survives a failed call.
        existed = key in self.attrib
        previous_value = self.attrib[key] if existed else None
        self.attrib[key] = value
        try:
            if self.nsmap and namespaces is None:
                namespaces = self.nsmap
            # Only the first reported error is raised.
            for error in self.iter_errors(use_defaults, namespaces,
                                          max_depth):
                raise error
        except Exception:
            if existed:
                self.attrib[key] = previous_value  # revert to original
            else:
                del self.attrib[key]  # it did not exist, remove it again
            raise

        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            setattr(self, self._expand_xDE_attrib_prefix + key, value)
    finally:
        if uses_lock:
            self._locked = False  # unlock on every exit path


# Patching: attach set_attrib to DataElement as an instance method.
setattr(xmlschema.dataobjects.DataElement, "set_attrib", set_attrib)
# add del_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def del_attrib(self,
               key: str,
               use_defaults: bool = True,
               namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
               max_depth: Optional[int] = None) -> None:
    """Delete ``self.attrib[key]`` and keep the deletion only if validation passes.

    On failure the attribute is recreated with its original value. The
    optional ``_locked`` flag is always released, also when ``iter_errors``
    raises instead of yielding. On success, if ``_expand_xDE_attrib_prefix``
    is present (set by ``expand_xmlschema_DataElement``), the mirrored
    instance attribute ``<prefix><key>`` is removed too, when it exists.

    Always returns ``None``.

    :raises: :exc:`XMLSchemaValidationError` if XML data object is not
        valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema
        bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    :raises: :exc:`KeyError` if xml tag attribute (.attrib[key]) doesn't exist
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock and self._locked:
        raise ValueLockedError(self, variable_name='attrib[' + key + ']')
    if key not in self.attrib:
        # Nothing was locked or changed yet, so there is nothing to undo.
        raise KeyError("'" + key + "' Attribute does not exist, nothing to do")
    if uses_lock:
        self._locked = True

    try:
        previous_value = self.attrib.pop(key)
        try:
            if self.nsmap and namespaces is None:
                namespaces = self.nsmap
            # Only the first reported error matters.
            error = next(
                iter(self.iter_errors(use_defaults, namespaces, max_depth)),
                None)
        except Exception:
            # iter_errors itself failed: put the attribute back untouched.
            self.attrib[key] = previous_value
            raise

        if error is not None:
            # attribute required, recreate value back to original
            self.attrib[key] = previous_value
            # append informational message to error output
            if hasattr(error, "message"):
                error.message += (
                    ":\n\nThe attribute value was returned to original "
                    "state due to error"
                    "\n\nThis error represents the state of this element "
                    "IF the attribute were removed")
            raise error

        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            mirror_name = self._expand_xDE_attrib_prefix + key
            # The mirror may be missing if the attribute was added to
            # .attrib directly; only remove it when present.
            if hasattr(self, mirror_name):
                delattr(self, mirror_name)
    finally:
        if uses_lock:
            self._locked = False  # unlock on every exit path


# Patching: attach del_attrib to DataElement as an instance method.
setattr(xmlschema.dataobjects.DataElement, "del_attrib", del_attrib)
# Monkey patching some class methods helpful for learning / troubleshooting
def _show_me_mro(cls):
    """Return ``cls.mro()``, the method resolution order of the class."""
    return cls.mro()


# Patching: the plain function is wrapped in classmethod exactly once per
# target class. Wrapping an already decorated classmethod a second time only
# works on Python 3.9-3.12 (chaining of classmethod descriptors).
setattr(xmlschema.validators.schemas.XsdValidator, "_show_me_mro",
        classmethod(_show_me_mro))
setattr(xmlschema.dataobjects.DataElement, "_show_me_mro",
        classmethod(_show_me_mro))
# Demonstration setup: replace the two placeholder paths below with your own
# .xsd and .xml files. JsonMLConverter is passed as the schema's converter.
schema = xmlschema.XMLSchema("path/to/your.xsd",
converter=xmlschema.JsonMLConverter)
# Decode the XML document via the schema's to_objects(); the result is what
# expand_xmlschema_DataElement is run on at the end of the script.
xmlobj = schema.to_objects("path/to/your.xml")
# creates dot notation naming for all children recursively
# c_ default prefix for child, a_ default prefix for tag attribute
# _# numbered suffix for all children starting at 0
# increases from there if more than 1 child with same name
def expand_xmlschema_DataElement(xsobj: xmlschema.dataobjects.DataElement,
                                 child_prefix: str = 'c_',
                                 attrib_prefix: str = 'a_') -> None:
    """Add dot-notation attributes for *xsobj*'s XML attributes and children.

    For every key in ``xsobj.attrib`` an instance attribute
    ``<attrib_prefix><key>`` holding a copy of the value is set. For every
    child an instance attribute
    ``<child_prefix><local_name>_<n>`` referencing the child is set, where
    ``n`` is the first number (from 0) not already in use on *xsobj*. The
    same expansion, with the same prefixes, is applied to every child
    recursively. ``_locked`` is initialised to ``False`` on each element.

    :param xsobj: the element to expand (modified in place).
    :param child_prefix: prefix for the per-child attributes.
    :param attrib_prefix: prefix for the per-XML-attribute attributes.
    """
    xsobj._expand_xDE_child_prefix = child_prefix
    xsobj._expand_xDE_attrib_prefix = attrib_prefix
    # _locked just an idea at the moment, may or may not use this in the end
    xsobj._locked = False

    # set an instance attribute for each xml tag attribute
    # DO NOT change these directly, use set_attrib on the parent class
    # which changes .attrib first
    # These are currently just a copy of what is in the .attrib dict
    # Validation has no knowledge of their existence if they are changed
    # outside of design
    # TODO research @property / or some type of better variable binding?
    if xsobj.attrib:
        for key, value in xsobj.attrib.items():
            setattr(xsobj, attrib_prefix + key, value)

    # set an instance attribute for each child
    for child in xsobj.iterchildren():
        # Pass the prefixes down so the whole tree uses the caller's choice
        # rather than falling back to the defaults below the root.
        expand_xmlschema_DataElement(child, child_prefix, attrib_prefix)
        stem = child_prefix + child.local_name + "_"
        index = 0
        # Probe for the first free numbered name.
        while hasattr(xsobj, stem + str(index)):
            index += 1
        setattr(xsobj, stem + str(index), child)
# Expand the demo tree in place using the default 'c_' / 'a_' prefixes.
expand_xmlschema_DataElement(xmlobj)
More information about the Python-list
mailing list