<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD><TITLE>Parsing XML with ElementTree: invalid characters in input files</TITLE>
<META content="text/html; charset=us-ascii" http-equiv=Content-Type>
<META name=GENERATOR content="MSHTML 8.00.6001.18904"></HEAD>
<BODY><!-- Converted from text/plain format -->
<P><FONT size=2 face=Arial>Hi,<BR><BR>I'm parsing XML files using ElementTree 
from xml.etree (see the code below, also attached as xml_parse_example.py).</FONT></P>
<P><FONT size=2 face=Arial>However, I'm coming across input XML files (an 
example is attached: tmp.xml) that include invalid characters, which produce the 
following traceback:</FONT></P>
<P><FONT size=2 face="Courier New">$ python xml_parse_example.py <BR>Traceback 
(most recent call last):<BR>  File "xml_parse_example.py", line 63, in 
&lt;module&gt;<BR>    tree = 
xml2dict.open_and_parse_xml_file()<BR>  File "xml_parse_example.py", line 
14, in open_and_parse_xml_file<BR>    tree = 
ElementTree.parse(f)<BR>  File "c:\Python26\lib\xml\etree\ElementTree.py", 
line 862, in parse<BR>    tree.parse(source, parser)<BR>  
File "c:\Python26\lib\xml\etree\ElementTree.py", line 586, in 
parse<BR>    parser.feed(data)<BR>  File 
"c:\Python26\lib\xml\etree\ElementTree.py", line 1245, in 
feed<BR>    self._parser.Parse(data, 
0)<BR>xml.parsers.expat.ExpatError: not well-formed (invalid token): line 6, 
column 34</FONT></P>
<P><FONT size=2 face=Arial>I read the documentation for xml.etree.ElementTree 
and see that it may take an optional <EM>parser</EM> parameter, but I don't know 
what this parser should be in order to ignore the invalid characters.<BR></FONT><FONT 
size=2 face=Arial><BR><FONT color=#0000ff>Could you suggest a way to call 
ElementTree, so it won't bomb on these invalid characters?</FONT></FONT></P>
<P><FONT size=2 face=Arial>Thanks,<BR>Ron.</FONT></P><FONT face=Arial>
<P><FONT size=2>
<HR>
</FONT></P>
<P><FONT size=2 face="Courier New">#!/usr/bin/env python</FONT></P>
<P><FONT size=2 face="Courier New">from xml.etree import ElementTree<BR>import 
pprint</FONT></P>
<P><FONT size=2 face="Courier New">compute_tail = False</FONT></P>
<P><FONT size=2 face="Courier New">class XmlFileToDict():<BR>    
def __init__(self, xml_file_path):<BR>        
self.xml_file_path = xml_file_path</FONT></P>
<P><FONT size=2 face="Courier New">    def 
open_and_parse_xml_file(self):<BR>        
with open(self.xml_file_path, 'rt') as 
f:<BR>            tree = 
ElementTree.parse(f)<BR>        return 
tree</FONT></P>
<P><FONT size=2 face="Courier New">    def dict_list(self, 
node):<BR>            res 
= {}<BR>            
res[node.tag] = 
[]<BR>            
self.xml_to_dict(node,res[node.tag])<BR>            
reply = {}<BR>            
if 
compute_tail:<BR>                
reply[node.tag] = 
{'value':res[node.tag],'attribs':node.attrib,'tail':node.tail}<BR>            
else:<BR>                
reply[node.tag] = 
{'value':res[node.tag],'attribs':node.attrib}<BR>            
<BR>            return 
reply</FONT></P>
<P><FONT size=2 face="Courier New">    def xml_to_dict(self, 
node, 
res):<BR>            rep 
= {}<BR>            
<BR>            if 
len(node):<BR>                    
#n = 
0<BR>                    
for n in 
list(node):<BR>                            
rep[node.tag] = 
[]<BR>                            
value = 
self.xml_to_dict(n,rep[node.tag])<BR>                            
if 
len(n):<BR>                                    
if 
compute_tail:<BR>                                        
value = 
{'value':rep[node.tag],'attributes':n.attrib,'tail':n.tail}<BR>                                    
else:<BR>                                        
value = 
{'value':rep[node.tag],'attributes':n.attrib}<BR>                                    
res.append({n.tag:value})<BR>                            
else 
:<BR>                                    
<BR>                                    
res.append(rep[node.tag][0])<BR>                            
<BR>            
else:<BR>                    
<BR>                    
<BR>                    
value = 
{}<BR>                    
if 
compute_tail:<BR>                        
value = 
{'value':node.text,'attributes':node.attrib,'tail':node.tail}<BR>                    
else:<BR>                        
value = 
{'value':node.text,'attributes':node.attrib}<BR>                    
<BR>                    
res.append({node.tag:value})<BR>            
<BR>            return 
<BR>                        
<BR>if __name__ == '__main__' :<BR>    xml_file_path ='tmp.xml' 
<BR>    xml2dict = 
XmlFileToDict(xml_file_path)<BR>    tree = 
xml2dict.open_and_parse_xml_file()<BR>    xml_dict = 
xml2dict.dict_list(tree.getroot())<BR>    
pprint.pprint(xml_dict)</FONT></FONT></P>
<P><FONT size=2 face=Arial>
<HR>
<BR></FONT></P></BODY></HTML>