Issue with xml iterparse

Thu Jun 3 17:13:37 EDT 2010

On Jun 3, 3:59 pm, Chris Rebert <c... at rebertia.com> wrote:
> On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi... at gmail.com> wrote:
> > I am using lxml iterparse and running into a very obscure error. When
> > I run iterparse on a file, it will occasionally return an element that
> > has a element.text == None when the element clearly has text in it.
>
> > I copy and pasted the problem xml into a python string, used StringIO
> > to create a file-like object out of it, and ran a test using iterparse
> > with expected output, and it ran perfectly fine. So it only happens
> > when I try to run iterparse on the actual file.
>
> > So then I tried opening the file, reading the data, turning that data
> > into a file-like object using StringIO, then running iterparse on it,
> > and the same problem (element.text == None) occurred.
>
> > I even tried this:
> > f = codecs.open(abbyy_filename, 'r', encoding='utf-8')
> > file_data = f.read()
> > file_like_object = StringIO.StringIO(file_data)
> > for event, element in iterparse(file_like_object, events=("start",
> > "end")):
>
> IIRC, XML parsers operate on bytes directly (since they have to
> determine the encoding themselves anyway), not pre-decoded Unicode
> characters, so I think your manual UTF-8 decoding could be the
> problem.
> Have you tried simply:
>
> f = open(abbyy_filename, 'r')
> for event, element in iterparse(f, events=("start", "end")):
>     #whatever
>
> ?
>
> Apologies if you already have, but since you didn't include the
> original, albeit probably trivial, error-causing code, this relatively
> simple error couldn't be ruled out.
>
> Cheers,
> Chris
> --http://blog.rebertia.com

Sorry for not mentioning it, but I tried that as well and it failed.
Here is the relevant class. AbbyyLine and Abbyyword just take the
element's text and writes it to a file/file-like object. parse_doc is
where I use iterparse. The relevant part is very minimal and there is
a lot of fluff to ignore, so I didn't initially post it:

class AbbyyDocParse(object):

    """Takes an abbyy filename and parses the contents"""
    def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT,
        format_list=OUTPUT_TYPES, string_only=False):
        self.extension = extension
        self.format_list = format_list
        #Create the file handles for the output files
        self.create_filehandles(abbyy_filename, string_only)
        #Parse the document
        self.parse_doc(abbyy_filename)
        #Close the output filehandles
        self.close_filehandles(abbyy_filename, string_only)

    def create_filehandles(self, abbyy_filename, string_only):
        """Create output filehandles"""
        #if output goes to a file
        if not string_only:
            #Make sure the file is an abbyy file
            if not abbyy_filename.endswith(self.extension):
                raise ParserException, "Bad abbyy filename given: %s"
\
                    % (abbyy_filename)
            #get the base path and filename for output files
            filename = abbyy_filename.replace(self.extension, '')
        #Loop through the different formats
        for format_type in self.format_list:
            #if output goes to a file
            if not string_only:
                #Create output filename
                out_file = "%s%s" % (filename,
OUTPUT_EXTENSIONS.get(format_type))
                #Opens the format type filehandle
                try:
                    setattr(self, "%s_handle" % (format_type),
open(out_file,'w'))
                except:
                    raise IOError, "Could not open file: %s" %
(out_file)
            #if output goes to a string
            else:
                #Opens the format type StringIO
                try:
                    setattr(self, "%s_handle" % (format_type),
StringIO.StringIO())
                except:
                    raise IOError, "Could not open string output: %s"
% (out_file)

    def parse_doc(self, abbyy_filename):
        """Parses the abbyy document"""
        #Write the first line of the xml doc, if specified
        if getattr(self, 'xml_handle', None):
            self.xml_handle.write('<?xml version="1.0"
encoding="utf-8"?>\n')
        #Memory efficient iterparse opens file and loops through
content
        for event, element in iterparse(abbyy_filename,
events=("start", "end")):
            #ignore the namespace, if it has one
            if NAMESPACE_REGEX.search(element.tag, 0):
                element_tag = NAMESPACE_REGEX.search(element.tag,
0).group(1)
            else:
                element_tag = element.tag
            #if this is the page element
            if element_tag == 'page':
                self.write_page(event, element)
            #If at the beginning of the line
            elif element_tag == 'line' and event == 'start':
                #Create the line
                line = AbbyyLine(element)
                #Instantiate first word
                word = AbbyyWord(line)
            #If at the end of the line, and an output text file exists
            if element_tag == 'line' and event == 'end' and \
                getattr(self, 'text_handle', None):
                #output line data to text file
                line.write_line(self.text_handle)
            #If at the end of the line, and an output text file exists
            if element_tag == 'line' and event == 'end' and \
                getattr(self, 'xml_handle', None):
                #output line data to text file
                word.write_word(self.xml_handle)
            #if outputting to an xml file, create word data
            if getattr(self, 'xml_handle', None) and \
                element_tag == 'charParams' and event == 'start':
                #Insert character into word
                word.insert_char(element, self.xml_handle)
            #if outputting to a text file, create line data
            if getattr(self, 'text_handle', None) and \
                element_tag == 'charParams' and event == 'start':
                #Insert character into line
                line.insert_char(element)

    def write_page(self, event, element):
        """Parse the page contents"""
        #page open tag event
        if event == 'start':
            #Write page info to xml file
            if getattr(self, 'xml_handle', None):
                #Get the page info
                x_dim = element.get('width')
                y_dim = element.get('height')
                resolution = element.get('resolution')
                #Write the page info to the file
                self.xml_handle.write('<page>\n')
                self.xml_handle.write('<filename/>\n')
                self.xml_handle.write('<confidence/>\n')
                self.xml_handle.write("<xDim>%s</xDim>\n" % (x_dim))
                self.xml_handle.write("<yDim>%s</yDim>\n" % (y_dim))
                self.xml_handle.write("<resolution>%s</resolution>\n"
% (resolution))
                self.xml_handle.write('<zone/>\n')
                self.xml_handle.write('<wordsboundingboxes>\n')
        #page close tag event
        elif event == 'end':
            #Write page info to xml file
            if getattr(self, 'xml_handle', None):
                #Write closing tags to file
                self.xml_handle.write('</wordsboundingboxes>\n')
                self.xml_handle.write('</page>')

    def write_line(self, event, element):
        """Parse the line contents"""
        #line open tag event
        if event == 'start':
            pass
        #page close tag event
        elif event == 'end':
            pass

    def write_word(self, event, element):
        """Parse the charParams contents"""
        pass

    def close_filehandles(self, abbyy_filename, string_only):
        """Close the open filehandles"""
        #if the files exist
        if not string_only:
            #Loop through the different formats
            for format_type in self.format_list:
                #Opens the format type filehandle
                try:
                    getattr(self, "%s_handle" % (format_type)).close()
                except:
                    raise IOError, "Could not close format type: %s
for file: %s" \
                        % (format_type, abbyy_filename)