<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">

<HTML><HEAD><TITLE>ElementTree: parsing XML files that contain invalid characters</TITLE>

<META content="text/html; charset=us-ascii" http-equiv=Content-Type>

<META name=GENERATOR content="MSHTML 8.00.6001.18904"></HEAD>

<BODY><!-- Converted from text/plain format -->

<P><FONT size=2 face=Arial>Hi,<BR><BR>I'm parsing XML files using ElementTree 

from xml.etree (see code below (and attached xml_parse_example.py)).</FONT></P>

<P><FONT size=2 face=Arial>However, I'm coming across input XML files (attached 

an example: tmp.xml) which include invalid characters, that produce the 

following traceback:</FONT></P>

<P><FONT size=2 face="Courier New">$ python xml_parse_example.py <BR>Traceback 

(most recent call last):<BR>  File "xml_parse_example.py", line 63, in 

&lt;module&gt;<BR>    tree = 

xml2dict.open_and_parse_xml_file()<BR>  File "xml_parse_example.py", line 

14, in open_and_parse_xml_file<BR>    tree = 

ElementTree.parse(f)<BR>  File "c:\Python26\lib\xml\etree\ElementTree.py", 

line 862, in parse<BR>    tree.parse(source, parser)<BR>  

File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in 

parse<BR>    parser.feed(data)<BR>  File 

"c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in 

feed<BR>    self._parser.Parse(data, 

0)<BR>xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, 

column 34</FONT></P>

<P><FONT size=2 face=Arial>I read the documentation for xml.etree.ElementTree 

and see that it may take an optional <EM>parser</EM> parameter, but I don't know 

which parser to pass so that the invalid characters are ignored.<BR></FONT><FONT 

size=2 face=Arial><BR><FONT color=#0000ff>Could you suggest a way to call 

ElementTree, so it won't bomb on these invalid characters?</FONT></FONT></P>

<P><FONT size=2 face=Arial>Thanks,<BR>Ron.</FONT></P><FONT face=Arial>

<HR>

<P><FONT size=2 face="Courier New">#!/usr/bin/env python</FONT></P>

<P><FONT size=2 face="Courier New">from xml.etree import ElementTree<BR>import 

pprint</FONT></P>

<P><FONT size=2 face="Courier New">compute_tail = False</FONT></P>

<P><FONT size=2 face="Courier New">class XmlFileToDict():<BR>    

def __init__(self, xml_file_path):<BR>        

self.xml_file_path = xml_file_path</FONT></P>

<P><FONT size=2 face="Courier New">    def 

open_and_parse_xml_file(self):<BR>        

with open(self.xml_file_path, 'rt') as 

f:<BR>            tree = 

ElementTree.parse(f)<BR>        return 

tree</FONT></P>

<P><FONT size=2 face="Courier New">    def dict_list(self, 

node):<BR>            res 

= {}<BR>            

res[node.tag] = 

[]<BR>            

self.xml_to_dict(node,res[node.tag])<BR>            

reply = {}<BR>            

if 

compute_tail:<BR>                

reply[node.tag] = 

{'value':res[node.tag],'attribs':node.attrib,'tail':node.tail}<BR>            

else:<BR>                

reply[node.tag] = 

{'value':res[node.tag],'attribs':node.attrib}<BR>            

<BR>            return 

reply</FONT></P>

<P><FONT size=2 face="Courier New">    def xml_to_dict(self, 

node, 

res):<BR>            rep 

= {}<BR>            

<BR>            if 

len(node):<BR>                    

#n = 

0<BR>                    

for n in 

list(node):<BR>                            

rep[node.tag] = 

[]<BR>                            

value = 

self.xml_to_dict(n,rep[node.tag])<BR>                            

if 

len(n):<BR>                                    

if 

compute_tail:<BR>                                        

value = 

{'value':rep[node.tag],'attributes':n.attrib,'tail':n.tail}<BR>                                    

else:<BR>                                        

value = 

{'value':rep[node.tag],'attributes':n.attrib}<BR>                                    

res.append({n.tag:value})<BR>                            

else 

:<BR>                                    

<BR>                                    

res.append(rep[node.tag][0])<BR>                            

<BR>            

else:<BR>                    

<BR>                    

<BR>                    

value = 

{}<BR>                    

if 

compute_tail:<BR>                        

value = 

{'value':node.text,'attributes':node.attrib,'tail':node.tail}<BR>                    

else:<BR>                        

value = 

{'value':node.text,'attributes':node.attrib}<BR>                    

<BR>                    

res.append({node.tag:value})<BR>            

<BR>            return 

<BR>                        

<BR>if __name__ == '__main__' :<BR>    xml_file_path ='tmp.xml' 

<BR>    xml2dict = 

XmlFileToDict(xml_file_path)<BR>    tree = 

xml2dict.open_and_parse_xml_file()<BR>    xml_dict = 

xml2dict.dict_list(tree.getroot())<BR>    

pprint.pprint(xml_dict)</FONT></FONT></P>

<HR>

</BODY></HTML>