Issue with xml iterparse
bfrederi
brfredericks at gmail.com
Thu Jun 3 17:13:37 EDT 2010
On Jun 3, 3:59 pm, Chris Rebert <c... at rebertia.com> wrote:
> On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi... at gmail.com> wrote:
> > I am using lxml iterparse and running into a very obscure error. When
> > I run iterparse on a file, it will occasionally return an element that
> > has a element.text == None when the element clearly has text in it.
>
> > I copy and pasted the problem xml into a python string, used StringIO
> > to create a file-like object out of it, and ran a test using iterparse
> > with expected output, and it ran perfectly fine. So it only happens
> > when I try to run iterparse on the actual file.
>
> > So then I tried opening the file, reading the data, turning that data
> > into a file-like object using StringIO, then running iterparse on it,
> > and the same problem (element.text == None) occurred.
>
> > I even tried this:
> > f = codecs.open(abbyy_filename, 'r', encoding='utf-8')
> > file_data = f.read()
> > file_like_object = StringIO.StringIO(file_data)
> > for event, element in iterparse(file_like_object, events=("start",
> > "end")):
>
> IIRC, XML parsers operate on bytes directly (since they have to
> determine the encoding themselves anyway), not pre-decoded Unicode
> characters, so I think your manual UTF-8 decoding could be the
> problem.
> Have you tried simply:
>
> f = open(abbyy_filename, 'r')
> for event, element in iterparse(f, events=("start", "end")):
> #whatever
>
> ?
>
> Apologies if you already have, but since you didn't include the
> original, albeit probably trivial, error-causing code, this relatively
> simple error couldn't be ruled out.
>
> Cheers,
> Chris
> --http://blog.rebertia.com
Sorry for not mentioning it, but I tried that as well and it failed.
Here is the relevant class. AbbyyLine and AbbyyWord just take the
element's text and write it to a file/file-like object. parse_doc is
where I use iterparse. The relevant part is very minimal and there is
a lot of fluff to ignore, so I didn't initially post it:
class AbbyyDocParse(object):
    """Parse an abbyy file and write its contents to the output handles.

    Constructing an instance does all the work: it opens one output handle
    per entry in ``format_list`` (stored on the instance as
    ``<format_type>_handle``), streams the document through ``iterparse``
    and finally closes the file handles.

    Arguments:
        abbyy_filename -- path of the abbyy file to parse
        extension      -- suffix the filename must carry when writing files
        format_list    -- iterable of output format names
        string_only    -- when true, output goes to in-memory StringIO
                          objects (left open) instead of files on disk
    """

    def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT,
                 format_list=OUTPUT_TYPES, string_only=False):
        self.extension = extension
        self.format_list = format_list
        # Create the file handles for the output files
        self.create_filehandles(abbyy_filename, string_only)
        try:
            # Parse the document
            self.parse_doc(abbyy_filename)
        finally:
            # Close the output filehandles even when parsing fails, so a
            # parse error does not leak open files.
            self.close_filehandles(abbyy_filename, string_only)

    def create_filehandles(self, abbyy_filename, string_only):
        """Create one output handle per format in ``self.format_list``.

        Each handle is stored as the attribute ``<format_type>_handle``.

        Raises:
            ParserException -- writing to files and ``abbyy_filename`` does
                               not end with ``self.extension``
            IOError         -- an output handle could not be opened
        """
        # Output goes to in-memory strings
        if string_only:
            for format_type in self.format_list:
                try:
                    setattr(self, "%s_handle" % (format_type,),
                            StringIO.StringIO())
                except Exception:
                    # There is no output filename in this mode, so report
                    # the format type (the original referenced an undefined
                    # out_file here, which would raise NameError instead).
                    raise IOError("Could not open string output: %s"
                                  % (format_type,))
            return

        # Output goes to files: make sure the file is an abbyy file
        if not abbyy_filename.endswith(self.extension):
            raise ParserException("Bad abbyy filename given: %s"
                                  % (abbyy_filename,))
        # Get the base path and filename for output files. Strip only the
        # trailing extension; str.replace would also remove any earlier
        # occurrence of the extension inside the path.
        if self.extension:
            filename = abbyy_filename[:-len(self.extension)]
        else:
            filename = abbyy_filename

        for format_type in self.format_list:
            # Create output filename
            out_file = "%s%s" % (filename,
                                 OUTPUT_EXTENSIONS.get(format_type))
            # Open the format type filehandle
            try:
                setattr(self, "%s_handle" % (format_type,),
                        open(out_file, 'w'))
            except Exception:
                raise IOError("Could not open file: %s" % (out_file,))

    def parse_doc(self, abbyy_filename):
        """Stream the abbyy document and feed the output handles.

        Character data is collected on the "end" event of each charParams
        element. iterparse only guarantees an element's attributes on
        "start"; its text may not have been read yet at that point, which
        shows up as an intermittent ``element.text == None``.
        """
        xml_handle = getattr(self, 'xml_handle', None)
        text_handle = getattr(self, 'text_handle', None)

        # Write the first line of the xml doc, if specified
        if xml_handle:
            xml_handle.write('<?xml version="1.0" encoding="utf-8"?>\n')

        # Current line/word; stay None until the first <line> start tag
        line = None
        word = None

        # Memory efficient iterparse opens file and loops through content
        for event, element in iterparse(abbyy_filename,
                                        events=("start", "end")):
            # Ignore the namespace, if it has one
            match = NAMESPACE_REGEX.search(element.tag, 0)
            if match:
                element_tag = match.group(1)
            else:
                element_tag = element.tag

            if element_tag == 'page':
                self.write_page(event, element)
            elif element_tag == 'line':
                if event == 'start':
                    # Create the line and instantiate its first word
                    line = AbbyyLine(element)
                    word = AbbyyWord(line)
                else:
                    # End of the line: flush the collected data
                    if text_handle:
                        line.write_line(text_handle)
                    if xml_handle:
                        word.write_word(xml_handle)
            elif element_tag == 'charParams' and event == 'end':
                # NOTE(review): charParams seen before any <line> are
                # skipped; the original would have raised an unbound-local
                # error in that case. Confirm that is acceptable.
                if line is None:
                    continue
                # If outputting to an xml file, insert character into word
                if xml_handle:
                    word.insert_char(element, xml_handle)
                # If outputting to a text file, insert character into line
                if text_handle:
                    line.insert_char(element)

    def write_page(self, event, element):
        """Write the page open/close markup to the xml handle, if any.

        On "start" the page's width, height and resolution attributes are
        written along with the opening tags; on "end" the closing tags are
        written. Does nothing when there is no xml handle.
        """
        xml_handle = getattr(self, 'xml_handle', None)
        if not xml_handle:
            return

        # Page open tag event
        if event == 'start':
            # Get the page info
            x_dim = element.get('width')
            y_dim = element.get('height')
            resolution = element.get('resolution')
            # Write the page info to the file
            xml_handle.write('<page>\n')
            xml_handle.write('<filename/>\n')
            xml_handle.write('<confidence/>\n')
            xml_handle.write("<xDim>%s</xDim>\n" % (x_dim))
            xml_handle.write("<yDim>%s</yDim>\n" % (y_dim))
            xml_handle.write("<resolution>%s</resolution>\n"
                             % (resolution))
            xml_handle.write('<zone/>\n')
            xml_handle.write('<wordsboundingboxes>\n')
        # Page close tag event
        elif event == 'end':
            # Write closing tags to file
            xml_handle.write('</wordsboundingboxes>\n')
            xml_handle.write('</page>')

    def write_line(self, event, element):
        """Placeholder for line handling; currently does nothing."""
        # Line open tag event
        if event == 'start':
            pass
        # Line close tag event
        elif event == 'end':
            pass

    def write_word(self, event, element):
        """Placeholder for charParams handling; currently does nothing."""
        pass

    def close_filehandles(self, abbyy_filename, string_only):
        """Close the output file handles.

        String (StringIO) handles are left open; nothing is closed when
        ``string_only`` is true.

        Raises:
            IOError -- a handle is missing or could not be closed
        """
        # Only real files are closed
        if string_only:
            return
        # Loop through the different formats
        for format_type in self.format_list:
            try:
                getattr(self, "%s_handle" % (format_type,)).close()
            except Exception:
                raise IOError("Could not close format type: %s for file: %s"
                              % (format_type, abbyy_filename))
More information about the Python-list
mailing list