Python - working with xml/lxml/objectify/schemas, datatypes, and assignments

aapost aapost at idontexist.club
Sun Jan 15 20:13:35 EST 2023


On 1/3/23 22:57, aapost wrote:
> I am trying to wrap my head around how one goes about working with and 
> editing xml elements ... Back to 
> contemplating and tinkering..

For anyone in a similar situation, xmlschema is actually quite nice.

It didn't have the features I was looking for out of the box, but it 
does have a to_objects function and I have learned quite a bit while 
picking it apart. I am able to patch it to be good enough for my 
requirements.

Below is the patch for anyone interested:

#
# Contribution for the xmlschema & elementpath python modules which are
# Copyright (c), 2016-2020, SISSA (International School for Advanced
# Studies).
# All rights reserved.
#
# This file is distributed under the terms of the MIT License.
# See the file 'LICENSE' in the root directory of the present
# distribution, or http://opensource.org/licenses/MIT.
#

# Patching and expansion of the xmlschema.dataobjects.DataElement object
# features.
# To get the best demonstration, change the schema variable to your .xsd
# and xmlobj to your .xml file, then run this as:
#   $ python -i filename.py

from typing import Any, Optional, Union, Tuple
#from types import MethodType

class ValueLockedError(Exception):
    """Raised by the guarded setters when the target's ``_locked`` flag is True.

    The rendered text is stored on ``self.message`` and is also handed to
    ``Exception`` so that ``str(exc)`` shows the same text.
    """

    def __init__(self, obj, variable_name):
        # Assemble the text piecewise: what could not be set, then the object
        # itself, then the reason.
        pieces = [
            "Can't set .",
            variable_name,
            "\nThe object:\n",
            str(obj),
            "\nis Locked (._locked is set to True)",
        ]
        self.message = "".join(pieces)
        super().__init__(self.message)

# importing in order necessary for intended monkey patch
import elementpath.etree as ep_etree

# Monkey patching additional static functions onto the imported
# elementpath.etree module

# for namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering)
def etree_remove_registered_namespace(elem: ep_etree.ElementProtocol,
                                      uri: str = '') -> None:
    """
    Drop the namespace registration for *uri*.

    *elem* is only used to decide where registrations live:

    * an element without an ``nsmap`` attribute -> the selected ElementTree
      module's global ``register_namespace._namespace_map`` (uri -> prefix);
    * an element with an ``nsmap`` attribute -> every prefix in
      ``elem.nsmap`` whose value equals *uri* is removed.

    An unknown *uri* is ignored in both cases.

    :param elem: an etree element.
    :param uri: the namespace URI to unregister.
    :raises TypeError: if *elem* is not an etree element.
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if not hasattr(elem, 'nsmap'):
        # _namespace_map is uri->prefix
        etree_module.register_namespace._namespace_map.pop(uri, None)
        return

    # TODO research this for better understanding
    # _namespace_map is uri->prefix
    # DataElement.nsmap prefix->uri
    # lxml etree .nsmap ?->?
    # not using lxml anyway so not really an issue as
    # this condition shouldn't be met
    # NOTE(review): if nsmap is handed out as a fresh copy on each access
    # (possibly the case for lxml), the removal below has no lasting
    # effect - confirm before relying on this branch.
    #
    # Collect the matching prefixes first: deleting from a dict while
    # iterating over it raises RuntimeError.
    # research - can there be multiple instances of uri to prefix?..
    # or are they intended to be 1:1?..
    stale_prefixes = [key for key, value in elem.nsmap.items() if value == uri]
    for key in stale_prefixes:
        elem.nsmap.pop(key, None)


# patching
# A module attribute is fetched without descriptor binding, so the plain
# function is stored: a bare staticmethod object is only callable on
# Python 3.10+.
setattr(ep_etree, "etree_remove_registered_namespace",
        etree_remove_registered_namespace)

# for namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering)
def etree_get_registered_namespaces(elem: ep_etree.ElementProtocol) -> dict:
    """
    Return the namespace registrations that apply to *elem*.

    For an element without an ``nsmap`` attribute this is the selected
    ElementTree module's global ``register_namespace._namespace_map`` itself
    (not a copy), so changes made by the caller affect the registry.
    For an element with ``nsmap`` that attribute is returned instead.

    :param elem: an etree element.
    :raises TypeError: if *elem* is not an etree element.
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if hasattr(elem, 'nsmap'):
        return elem.nsmap  # shouldn't be met
    return etree_module.register_namespace._namespace_map


# patching
# A module attribute is fetched without descriptor binding, so the plain
# function is stored: a bare staticmethod object is only callable on
# Python 3.10+.
setattr(ep_etree, "etree_get_registered_namespaces",
        etree_get_registered_namespaces)

# for namespace management of xml.etree.ElementTree code paths (which use
# the global variable register_namespace._namespace_map for namespace
# registering)
def etree_register_namespace(elem: ep_etree.ElementProtocol,
                             prefix: Optional[str] = None,
                             uri: Optional[str] = None) -> None:
    """
    Register *prefix* for *uri* with the etree implementation behind *elem*.

    For an element without an ``nsmap`` attribute the selected ElementTree
    module's ``register_namespace(prefix, uri)`` is called; otherwise
    ``elem.nsmap[prefix]`` is set to *uri*. If either *prefix* or *uri* is
    ``None`` nothing is registered.

    :param elem: an etree element.
    :param prefix: the namespace prefix.
    :param uri: the namespace URI.
    :raises TypeError: if *elem* is not an etree element.
    """
    etree_module: Any
    if not ep_etree.is_etree_element(elem):
        raise TypeError(f"{elem!r} is not an Element")
    elif isinstance(elem, ep_etree.PyElementTree.Element):
        etree_module = ep_etree.PyElementTree
    elif not hasattr(elem, 'nsmap'):
        etree_module = ep_etree.ElementTree
    else:
        import lxml.etree as etree_module  # type: ignore[no-redef]

    if prefix is None or uri is None:
        return

    if not hasattr(elem, 'nsmap'):
        etree_module.register_namespace(prefix, uri)
    else:
        # TODO research this for better understanding
        # _namespace_map is uri->prefix
        # DataElement.nsmap prefix->uri
        # lxml etree .nsmap ?->?
        # not using lxml anyway so not really an issue as
        # this condition shouldn't be met
        elem.nsmap[prefix] = uri


# patching
# A module attribute is fetched without descriptor binding, so the plain
# function is stored: a bare staticmethod object is only callable on
# Python 3.10+.
setattr(ep_etree, "etree_register_namespace",
        etree_register_namespace)


# importing in order necessary for intended monkey patch
import xmlschema

# Monkey patching additional instance functions to the import of xmlschema
# specifically xmlschema.dataobjects.DataElement

# Instance functions so DataElement object can use above
# elementpath.etree namespace functions
def register_namespace(self,
                       prefix: Optional[str] = None,
                       uri: Optional[str] = None) -> None:
    """
    Register *prefix* for *uri* through ``ep_etree.etree_register_namespace``.

    The object is encoded with ``validation='lax'`` (which yields an
    ``(element, errors)`` pair) only to obtain an element to pass along;
    the collected errors are ignored. If either argument is ``None`` the
    call is a no-op and the object is not encoded at all.
    """
    if prefix is None or uri is None:
        return  # nothing to register, so skip the encode round-trip
    root, _ = self.encode(validation='lax')
    ep_etree.etree_register_namespace(root, prefix, uri)


# patching
setattr(xmlschema.dataobjects.DataElement, "register_namespace",
        register_namespace)

def remove_registered_namespace(self, uri: str = '') -> None:
    """
    Unregister *uri* through ``ep_etree.etree_remove_registered_namespace``.

    The object is lax-encoded to obtain an element to pass along; the errors
    collected by the lax encoding are not used.
    """
    encoded_root, _lax_errors = self.encode(validation='lax')
    ep_etree.etree_remove_registered_namespace(encoded_root, uri)


# patching: expose the function as a DataElement method
xmlschema.dataobjects.DataElement.remove_registered_namespace = (
    remove_registered_namespace)

def get_registered_namespaces(self) -> dict:
    """
    Return what ``ep_etree.etree_get_registered_namespaces`` reports for the
    lax-encoded form of this object.

    The errors collected by the lax encoding are not used.
    """
    encoded_root, _lax_errors = self.encode(validation='lax')
    registered = ep_etree.etree_get_registered_namespaces(encoded_root)
    return registered


# patching: expose the function as a DataElement method
xmlschema.dataobjects.DataElement.get_registered_namespaces = (
    get_registered_namespaces)


# replacing .validate() & .is_valid() on DataElement so that namespaces
# from the DataElement get set to the xml.etree.ElementTree
# register_namespace._namespace_map global when used
def validate(self, use_defaults: bool = True,
             namespaces: "Optional[xmlschema.aliases.NamespacesType]" = None,
             max_depth: Optional[int] = None) -> None:
    """
    Validates the XML data object.

    :param use_defaults: passed through to :meth:`iter_errors`.
    :param namespaces: passed through to :meth:`iter_errors`; when omitted \
    and ``self.nsmap`` is non-empty, ``self.nsmap`` is used instead.
    :param max_depth: passed through to :meth:`iter_errors`.
    :raises: :exc:`XMLSchemaValidationError` if XML data object is not valid.
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    """
    if namespaces is None and self.nsmap:  # added code
        namespaces = self.nsmap  # added code
    # Only the first reported error is raised.
    for error in self.iter_errors(use_defaults, namespaces, max_depth):
        raise error

# patching: install the replacement validate() on DataElement
xmlschema.dataobjects.DataElement.validate = validate

def is_valid(self, use_defaults: bool = True,
             namespaces: "Optional[xmlschema.aliases.NamespacesType]" = None,
             max_depth: Optional[int] = None) -> bool:
    """
    Like :meth:`validate` except it does not raise an exception on validation
    error but returns ``True`` if the XML data object is valid, ``False`` if
    it's invalid.

    :param use_defaults: passed through to :meth:`iter_errors`.
    :param namespaces: passed through to :meth:`iter_errors`; when omitted \
    and ``self.nsmap`` is non-empty, ``self.nsmap`` is used instead.
    :param max_depth: passed through to :meth:`iter_errors`.
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    """
    if namespaces is None and self.nsmap:  # added code
        namespaces = self.nsmap  # added code
    # Stop at the first reported error; there is no need to collect them all.
    first_error = next(
        self.iter_errors(use_defaults, namespaces, max_depth), None)
    return first_error is None

# patching: install the replacement is_valid() on DataElement
xmlschema.dataobjects.DataElement.is_valid = is_valid


# replace .tostring() on DataElement to allow for
# xml_declaration/encoding support
# TODO research more, will likely customize a bit further
def tostring(self,
             namespaces: Optional[xmlschema.aliases.NamespacesType] = None,
             indent: str = '',
             max_lines: Optional[int] = None,
             spaces_for_tab: Optional[int] = None,
             xml_declaration: Optional[bool] = None,
             encoding: str = 'unicode',
             method: str = 'xml') -> Any:
    """
    Serializes the data element tree to an XML source string.

    The object is encoded with ``validation="strict"`` and the result is
    passed, together with every argument unchanged and in order, to
    ``ep_etree.etree_tostring``.

    :param namespaces: when omitted and ``self.nsmap`` is non-empty, \
    ``self.nsmap`` is used instead.
    :return: whatever ``ep_etree.etree_tostring`` returns.
    """
    if namespaces is None and self.nsmap:
        namespaces = self.nsmap

    # prefer strict on my output just in case..
    # (the lax alternative would be: root, errors = self.encode('lax'))
    root = self.encode(validation="strict")
    return ep_etree.etree_tostring(
        root, namespaces, indent, max_lines, spaces_for_tab,
        xml_declaration, encoding, method)


# patching
setattr(xmlschema.dataobjects.DataElement, "tostring", tostring)


# add get_value function - paired with set_value
def get_value(self) -> Any:
    """Return ``self.value`` unchanged (read-side partner of ``set_value``)."""
    # The former print(type(self)) was a debugging leftover; a getter should
    # not write to stdout.
    return self.value

# patching: expose get_value() as a DataElement method
xmlschema.dataobjects.DataElement.get_value = get_value

# add set_value function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def set_value(self,
              value: Any,
              use_defaults: bool = True,
              namespaces: "Optional[xmlschema.aliases.NamespacesType]" = None,
              max_depth: Optional[int] = None) -> None:
    """
    Assign ``self.value`` and keep the change only if validation passes.

    The new value is assigned, then :meth:`iter_errors` is consulted. If it
    reports an error (or raises), the previous value is restored and the
    exception propagates. When the object has a ``_locked`` attribute it is
    set to True for the duration of the call and is always reset to False
    afterwards.

    :param value: the new value.
    :param use_defaults: passed through to :meth:`iter_errors`.
    :param namespaces: passed through to :meth:`iter_errors`; when omitted \
    and ``self.nsmap`` is non-empty, ``self.nsmap`` is used instead.
    :param max_depth: passed through to :meth:`iter_errors`.
    :raises: :exc:`XMLSchemaValidationError` if XML data object is not \
    valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock:
        if self._locked:
            raise ValueLockedError(self, variable_name='value')
        self._locked = True

    # Kept in a local rather than on the instance so a failure can never
    # leave scratch attributes behind.
    previous = self.value
    try:
        self.value = value
        if namespaces is None and self.nsmap:
            namespaces = self.nsmap
        # Only the first reported error is raised.
        for error in self.iter_errors(use_defaults, namespaces, max_depth):
            raise error
    except BaseException:
        # Covers both a reported validation error and the validator itself
        # failing: either way revert value back to original.
        self.value = previous
        raise
    finally:
        if uses_lock:
            self._locked = False  # always unlock if using/exists

# patching: expose set_value() as a DataElement method
xmlschema.dataobjects.DataElement.set_value = set_value


# get_attrib - read-side partner of set_attrib
# drops the added logic of .get(): only an explicit key match succeeds
def get_attrib(self, key: str) -> Any:
    """
    Return ``self.attrib[key]``.

    A missing *key* propagates whatever the mapping raises for it
    (``KeyError`` for a plain dict).
    """
    attributes = self.attrib
    return attributes[key]

# patching: expose get_attrib() as a DataElement method
xmlschema.dataobjects.DataElement.get_attrib = get_attrib

# add set_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def set_attrib(self,
               key: str,
               value: Any,
               use_defaults: bool = True,
               namespaces: "Optional[xmlschema.aliases.NamespacesType]" = None,
               max_depth: Optional[int] = None) -> None:
    """
    Set ``self.attrib[key]`` and keep the change only if validation passes.

    The attribute is assigned, then :meth:`iter_errors` is consulted. If it
    reports an error (or raises), the previous state is restored (the old
    value is put back, or the key is removed if it did not exist before) and
    the exception propagates. On success, if the object carries
    ``_expand_xDE_attrib_prefix``, the mirror attribute named
    ``<prefix><key>`` is updated to the new value as well. When the object
    has a ``_locked`` attribute it is set to True for the duration of the
    call and is always reset to False afterwards.

    :param key: the attribute name.
    :param value: the new attribute value.
    :param use_defaults: passed through to :meth:`iter_errors`.
    :param namespaces: passed through to :meth:`iter_errors`; when omitted \
    and ``self.nsmap`` is non-empty, ``self.nsmap`` is used instead.
    :param max_depth: passed through to :meth:`iter_errors`.
    :raises: :exc:`XMLSchemaValidationError` if XML data object is not \
    valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock:
        if self._locked:
            raise ValueLockedError(self, variable_name='attrib[' + key + ']')
        self._locked = True

    try:
        # Remember the prior state in locals so a failure cannot leave
        # scratch attributes on the instance.
        existed = key in self.attrib
        previous = self.attrib[key] if existed else None

        self.attrib[key] = value
        try:
            if namespaces is None and self.nsmap:
                namespaces = self.nsmap
            # Only the first reported error is raised.
            for error in self.iter_errors(use_defaults, namespaces, max_depth):
                raise error
        except BaseException:
            if existed:
                self.attrib[key] = previous  # revert back to original
            else:
                del self.attrib[key]  # or just delete if it didn't exist
            raise

        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            setattr(self, self._expand_xDE_attrib_prefix + key, value)
    finally:
        if uses_lock:
            self._locked = False  # always unlock if using/exists

# patching: expose set_attrib() as a DataElement method
xmlschema.dataobjects.DataElement.set_attrib = set_attrib

# add del_attrib function
# assures change meets XMLSchema
# reverts back on error
# assumes data meets Schema to begin with, will remain unchanged in the
# end if it is not
def del_attrib(self,
               key: str,
               use_defaults: bool = True,
               namespaces: "Optional[xmlschema.aliases.NamespacesType]" = None,
               max_depth: Optional[int] = None) -> None:
    """
    Remove ``self.attrib[key]`` and keep the change only if validation passes.

    The attribute is removed, then :meth:`iter_errors` is consulted. If it
    reports an error (or raises), the attribute is restored and the exception
    propagates; a reported error that has a ``message`` attribute gets an
    explanatory note appended first. On success, if the object carries
    ``_expand_xDE_attrib_prefix``, the mirror attribute named
    ``<prefix><key>`` is removed as well when it is present. When the object
    has a ``_locked`` attribute it is set to True for the duration of the
    call and is always reset to False afterwards.

    :param key: the attribute name.
    :param use_defaults: passed through to :meth:`iter_errors`.
    :param namespaces: passed through to :meth:`iter_errors`; when omitted \
    and ``self.nsmap`` is non-empty, ``self.nsmap`` is used instead.
    :param max_depth: passed through to :meth:`iter_errors`.
    :raises: :exc:`XMLSchemaValidationError` if XML data object is not \
    valid after attempted change
    :raises: :exc:`XMLSchemaValueError` if the instance has no schema bindings.
    :raises: :exc:`ValueLockedError` if using ._locked and set to True
    :raises: :exc:`KeyError` if xml tag attribute (.attrib[key]) doesn't exist
    """
    uses_lock = hasattr(self, "_locked")
    if uses_lock:
        if self._locked:
            raise ValueLockedError(self, variable_name='attrib[' + key + ']')
        self._locked = True

    try:
        if key not in self.attrib:
            raise KeyError(
                "'" + key + "' Attribute does not exist, nothing to do")

        # Kept in a local rather than on the instance so a failure can never
        # leave scratch attributes behind.
        previous = self.attrib[key]
        del self.attrib[key]
        try:
            if namespaces is None and self.nsmap:
                namespaces = self.nsmap
            # Only the first reported error is raised.
            for error in self.iter_errors(use_defaults, namespaces, max_depth):
                # append informational message to error output
                if hasattr(error, "message"):
                    error.message += (
                        ":\n\nThe attribute value was returned to original"
                        " state due to error"
                        "\n\nThis error represents the state of this element"
                        " IF the attribute were removed")
                raise error
        except BaseException:
            # attribute required (or validator failed): recreate the value
            self.attrib[key] = previous
            raise

        # TODO research @property / or some type of better variable binding?
        # self._expand_xDE_attrib_prefix exists if
        # expand_xmlschema_DataElement is run
        if hasattr(self, '_expand_xDE_attrib_prefix'):
            mirror_name = self._expand_xDE_attrib_prefix + key
            # The mirror may be absent (e.g. the attribute was added to
            # .attrib directly); an unconditional delattr would then raise.
            if hasattr(self, mirror_name):
                delattr(self, mirror_name)
    finally:
        if uses_lock:
            self._locked = False  # always unlock if using/exists

# patching: expose del_attrib() as a DataElement method
xmlschema.dataobjects.DataElement.del_attrib = del_attrib


# Monkey patching some class methods helpful for learning / troubleshooting
def _show_me_mro(cls):
    """Return ``cls.mro()`` for the class this is attached to."""
    return cls.mro()


# The function is wrapped in classmethod exactly once, at attachment time.
# Decorating it with @classmethod as well would nest two classmethod objects,
# which only works on interpreters that chain classmethod descriptors.
setattr(xmlschema.validators.schemas.XsdValidator, "_show_me_mro",
        classmethod(_show_me_mro))
setattr(xmlschema.dataobjects.DataElement, "_show_me_mro",
        classmethod(_show_me_mro))


# Demonstration inputs: point these two paths at your own .xsd / .xml files.
schema = xmlschema.XMLSchema(
    "path/to/your.xsd",
    converter=xmlschema.JsonMLConverter,
)
# Build the DataElement tree for the document using the schema above.
xmlobj = schema.to_objects("path/to/your.xml")


# creates dot notation naming for all children recursively
# c_ default prefix for child, a_ default prefix for tag attribute
# _# numbered suffix for all children starting at 0
# increases from there if more than 1 child with same name
def expand_xmlschema_DataElement(xsobj: "xmlschema.dataobjects.DataElement",
                                 child_prefix: str = 'c_',
                                 attrib_prefix: str = 'a_') -> None:
    """
    Add dot-notation attributes to *xsobj* and, recursively, to its children.

    For every key in ``xsobj.attrib`` an attribute ``<attrib_prefix><key>``
    holding a copy of the value is set. For every child an attribute
    ``<child_prefix><local_name>_<n>`` referencing the child is set, where
    ``n`` is the lowest number (starting at 0) for which no such attribute
    exists yet. The prefixes are recorded on each object as
    ``_expand_xDE_child_prefix`` / ``_expand_xDE_attrib_prefix`` and
    ``_locked`` is initialised to False.

    :param xsobj: the element to expand.
    :param child_prefix: prefix for child attributes, applied at every level.
    :param attrib_prefix: prefix for XML-attribute mirrors, applied at \
    every level.
    """
    xsobj._expand_xDE_child_prefix = child_prefix
    xsobj._expand_xDE_attrib_prefix = attrib_prefix

    # _locked just an idea at the moment, may or may not use this in the end
    xsobj._locked = False

    # set a class attribute for each xml tag attribute
    # DO NOT change these directly, use set_attrib on the parent class
    # which changes .attrib first
    # These are currently just a copy of what is in the .attrib dict
    # Validation has no knowledge of their existence if they are changed
    # outside of design
    # TODO research @property / or some type of better variable binding?
    if xsobj.attrib:
        for key, value in xsobj.attrib.items():
            setattr(xsobj, attrib_prefix + key, value)

    # set a class attribute for each child
    for child in xsobj.iterchildren():
        # Pass the prefixes down; otherwise custom prefixes would apply to
        # the root only and descendants would silently fall back to defaults.
        expand_xmlschema_DataElement(child, child_prefix, attrib_prefix)
        stem = child_prefix + child.local_name + "_"
        index = 0
        while hasattr(xsobj, stem + str(index)):
            index += 1
        setattr(xsobj, stem + str(index), child)

# Expand the loaded document so it can be explored with dot notation.
expand_xmlschema_DataElement(xsobj=xmlobj)


More information about the Python-list mailing list