Issue with xml iterparse
bfrederi
brfredericks at gmail.com
Fri Jun 4 12:54:11 EDT 2010
On Jun 3, 4:13 pm, bfrederi <brfrederi... at gmail.com> wrote:
> On Jun 3, 3:59 pm, Chris Rebert <c... at rebertia.com> wrote:
>
>
>
> > On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi... at gmail.com> wrote:
> > > I am using lxml iterparse and running into a very obscure error. When
> > > I run iterparse on a file, it will occasionally return an element that
> > > has a element.text == None when the element clearly has text in it.
>
> > > I copy and pasted the problem xml into a python string, used StringIO
> > > to create a file-like object out of it, and ran a test using iterparse
> > > with expected output, and it ran perfectly fine. So it only happens
> > > when I try to run iterparse on the actual file.
>
> > > So then I tried opening the file, reading the data, turning that data
> > > into a file-like object using StringIO, then running iterparse on it,
> > > and the same problem (element.text == None) occurred.
>
> > > I even tried this:
> > > f = codecs.open(abbyy_filename, 'r', encoding='utf-8')
> > > file_data = f.read()
> > > file_like_object = StringIO.StringIO(file_data)
> > > for event, element in iterparse(file_like_object, events=("start",
> > > "end")):
>
> > IIRC, XML parsers operate on bytes directly (since they have to
> > determine the encoding themselves anyway), not pre-decoded Unicode
> > characters, so I think your manual UTF-8 decoding could be the
> > problem.
> > Have you tried simply:
>
> > f = open(abbyy_filename, 'r')
> > for event, element in iterparse(f, events=("start", "end")):
> > #whatever
>
> > ?
>
> > Apologies if you already have, but since you didn't include the
> > original, albeit probably trivial, error-causing code, this relatively
> > simple error couldn't be ruled out.
>
> > Cheers,
> > Chris
> > --http://blog.rebertia.com
>
> Sorry for not mentioning it, but I tried that as well and it failed.
> Here is the relevant class. AbbyyLine and Abbyyword just take the
> element's text and writes it to a file/file-like object. parse_doc is
> where I use iterparse. The relevant part is very minimal and there is
> a lot of fluff to ignore, so I didn't initially post it:
>
> class AbbyyDocParse(object):
>
> """Takes an abbyy filename and parses the contents"""
> def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT,
> format_list=OUTPUT_TYPES, string_only=False):
> self.extension = extension
> self.format_list = format_list
> #Create the file handles for the output files
> self.create_filehandles(abbyy_filename, string_only)
> #Parse the document
> self.parse_doc(abbyy_filename)
> #Close the output filehandles
> self.close_filehandles(abbyy_filename, string_only)
>
> def create_filehandles(self, abbyy_filename, string_only):
> """Create output filehandles"""
> #if output goes to a file
> if not string_only:
> #Make sure the file is an abbyy file
> if not abbyy_filename.endswith(self.extension):
> raise ParserException, "Bad abbyy filename given: %s"
> \
> % (abbyy_filename)
> #get the base path and filename for output files
> filename = abbyy_filename.replace(self.extension, '')
> #Loop through the different formats
> for format_type in self.format_list:
> #if output goes to a file
> if not string_only:
> #Create output filename
> out_file = "%s%s" % (filename,
> OUTPUT_EXTENSIONS.get(format_type))
> #Opens the format type filehandle
> try:
> setattr(self, "%s_handle" % (format_type),
> open(out_file,'w'))
> except:
> raise IOError, "Could not open file: %s" %
> (out_file)
> #if output goes to a string
> else:
> #Opens the format type StringIO
> try:
> setattr(self, "%s_handle" % (format_type),
> StringIO.StringIO())
> except:
> raise IOError, "Could not open string output: %s"
> % (out_file)
>
> def parse_doc(self, abbyy_filename):
> """Parses the abbyy document"""
> #Write the first line of the xml doc, if specified
> if getattr(self, 'xml_handle', None):
> self.xml_handle.write('<?xml version="1.0"
> encoding="utf-8"?>\n')
> #Memory efficient iterparse opens file and loops through
> content
> for event, element in iterparse(abbyy_filename,
> events=("start", "end")):
> #ignore the namespace, if it has one
> if NAMESPACE_REGEX.search(element.tag, 0):
> element_tag = NAMESPACE_REGEX.search(element.tag,
> 0).group(1)
> else:
> element_tag = element.tag
> #if this is the page element
> if element_tag == 'page':
> self.write_page(event, element)
> #If at the beginning of the line
> elif element_tag == 'line' and event == 'start':
> #Create the line
> line = AbbyyLine(element)
> #Instantiate first word
> word = AbbyyWord(line)
> #If at the end of the line, and an output text file exists
> if element_tag == 'line' and event == 'end' and \
> getattr(self, 'text_handle', None):
> #output line data to text file
> line.write_line(self.text_handle)
> #If at the end of the line, and an output text file exists
> if element_tag == 'line' and event == 'end' and \
> getattr(self, 'xml_handle', None):
> #output line data to text file
> word.write_word(self.xml_handle)
> #if outputting to an xml file, create word data
> if getattr(self, 'xml_handle', None) and \
> element_tag == 'charParams' and event == 'start':
> #Insert character into word
> word.insert_char(element, self.xml_handle)
> #if outputting to a text file, create line data
> if getattr(self, 'text_handle', None) and \
> element_tag == 'charParams' and event == 'start':
> #Insert character into line
> line.insert_char(element)
>
> def write_page(self, event, element):
> """Parse the page contents"""
> #page open tag event
> if event == 'start':
> #Write page info to xml file
> if getattr(self, 'xml_handle', None):
> #Get the page info
> x_dim = element.get('width')
> y_dim = element.get('height')
> resolution = element.get('resolution')
> #Write the page info to the file
> self.xml_handle.write('<page>\n')
> self.xml_handle.write('<filename/>\n')
> self.xml_handle.write('<confidence/>\n')
> self.xml_handle.write("<xDim>%s</xDim>\n" % (x_dim))
> self.xml_handle.write("<yDim>%s</yDim>\n" % (y_dim))
> self.xml_handle.write("<resolution>%s</resolution>\n"
> % (resolution))
> self.xml_handle.write('<zone/>\n')
> self.xml_handle.write('<wordsboundingboxes>\n')
> #page close tag event
> elif event == 'end':
> #Write page info to xml file
> if getattr(self, 'xml_handle', None):
> #Write closing tags to file
> self.xml_handle.write('</wordsboundingboxes>\n')
> self.xml_handle.write('</page>')
>
> def write_line(self, event, element):
> """Parse the line contents"""
> #line open tag event
> if event == 'start':
> pass
> #page close tag event
> elif event == 'end':
> pass
>
> def write_word(self, event, element):
> """Parse the charParams contents"""
> pass
>
> def close_filehandles(self, abbyy_filename, string_only):
> """Close the open filehandles"""
> #if the files exist
> if not string_only:
> #Loop through the different formats
> for format_type in self.format_list:
> #Opens the format type filehandle
> try:
> getattr(self, "%s_handle" % (format_type)).close()
> except:
> raise IOError, "Could not close format type: %s
> for file: %s" \
> % (format_type, abbyy_filename)
I think this is a bug with iterparse. I switched to using regular
parse for the parse_doc function, and it worked just fine:
def parse_doc(self, abbyy_filename):
"""Parses the abbyy document"""
#Write the first line of the xml doc, if specified
if getattr(self, 'xml_handle', None):
self.xml_handle.write('<?xml version="1.0" encoding="utf-8"?>
\n')
#Try to open the abbyy file
try:
f = open(abbyy_filename, "r")
#abbyy_filename is already and instance of a file-like object
except:
#parse the abbyy file
tree = parse(abbyy_filename)
#parse the open abbyyfile
else:
tree = parse(f)
f.close()
root = tree.getroot()
line = None
for element in root.iter("*"):
#ignore the namespace, if it has one
if NAMESPACE_REGEX.search(element.tag, 0):
element_tag = NAMESPACE_REGEX.search(element.tag,
0).group(1)
else:
element_tag = element.tag
#if this is the page element
if element_tag == 'page':
self.write_page('start', element)
#If at the beginning of the new line
elif element_tag == 'line':
#if a line already existed, and there is an output text
file
if line != None:
if getattr(self, 'text_handle', None):
#output line data to text file
line.write_line(self.text_handle)
elif getattr(self, 'xml_handle', None):
#output line data to xml file
word.write_word(self.xml_handle)
#Create the line
line = AbbyyLine(element)
#Instantiate first word
word = AbbyyWord(line)
#if outputting to an xml file, create word data
if getattr(self, 'xml_handle', None) and element_tag ==
'charParams':
#Insert character into word
word.insert_char(element, self.xml_handle)
#if outputting to a text file, create line data
if getattr(self, 'text_handle', None) and element_tag ==
'charParams':
#Insert character into line
line.insert_char(element)
#if a line already existed, and there is an output text file
if line != None:
if getattr(self, 'text_handle', None):
#output line data to text file
line.write_line(self.text_handle)
elif getattr(self, 'xml_handle', None):
#output line data to xml file
word.write_word(self.xml_handle)
self.write_page('end', element)
More information about the Python-list
mailing list