Issue with xml iterparse

Fri Jun 4 12:54:11 EDT 2010

On Jun 3, 4:13 pm, bfrederi <brfrederi... at gmail.com> wrote:
> On Jun 3, 3:59 pm, Chris Rebert <c... at rebertia.com> wrote:
>
>
>
> > On Thu, Jun 3, 2010 at 1:44 PM, bfrederi <brfrederi... at gmail.com> wrote:
> > > I am using lxml iterparse and running into a very obscure error. When
> > > I run iterparse on a file, it will occasionally return an element that
> > > has an element.text == None when the element clearly has text in it.
>
> > > I copied and pasted the problem xml into a python string, used StringIO
> > > to create a file-like object out of it, and ran a test using iterparse
> > > with expected output, and it ran perfectly fine. So it only happens
> > > when I try to run iterparse on the actual file.
>
> > > So then I tried opening the file, reading the data, turning that data
> > > into a file-like object using StringIO, then running iterparse on it,
> > > and the same problem (element.text == None) occurred.
>
> > > I even tried this:
> > > f = codecs.open(abbyy_filename, 'r', encoding='utf-8')
> > > file_data = f.read()
> > > file_like_object = StringIO.StringIO(file_data)
> > > for event, element in iterparse(file_like_object, events=("start",
> > > "end")):
>
> > IIRC, XML parsers operate on bytes directly (since they have to
> > determine the encoding themselves anyway), not pre-decoded Unicode
> > characters, so I think your manual UTF-8 decoding could be the
> > problem.
> > Have you tried simply:
>
> > f = open(abbyy_filename, 'r')
> > for event, element in iterparse(f, events=("start", "end")):
> >     #whatever
>
> > ?
>
> > Apologies if you already have, but since you didn't include the
> > original, albeit probably trivial, error-causing code, this relatively
> > simple error couldn't be ruled out.
>
> > Cheers,
> > Chris
> > --http://blog.rebertia.com
>
> Sorry for not mentioning it, but I tried that as well and it failed.
> Here is the relevant class. AbbyyLine and AbbyyWord just take the
> element's text and write it to a file/file-like object. parse_doc is
> where I use iterparse. The relevant part is very minimal and there is
> a lot of fluff to ignore, so I didn't initially post it:
>
> class AbbyyDocParse(object):
>
>     """Takes an abbyy filename and parses the contents"""
>     def __init__(self, abbyy_filename, extension=DEFAULT_ABBYY_EXT,
>         format_list=OUTPUT_TYPES, string_only=False):
>         self.extension = extension
>         self.format_list = format_list
>         #Create the file handles for the output files
>         self.create_filehandles(abbyy_filename, string_only)
>         #Parse the document
>         self.parse_doc(abbyy_filename)
>         #Close the output filehandles
>         self.close_filehandles(abbyy_filename, string_only)
>
>     def create_filehandles(self, abbyy_filename, string_only):
>         """Create output filehandles"""
>         #if output goes to a file
>         if not string_only:
>             #Make sure the file is an abbyy file
>             if not abbyy_filename.endswith(self.extension):
>                 raise ParserException, "Bad abbyy filename given: %s"
> \
>                     % (abbyy_filename)
>             #get the base path and filename for output files
>             filename = abbyy_filename.replace(self.extension, '')
>         #Loop through the different formats
>         for format_type in self.format_list:
>             #if output goes to a file
>             if not string_only:
>                 #Create output filename
>                 out_file = "%s%s" % (filename,
> OUTPUT_EXTENSIONS.get(format_type))
>                 #Opens the format type filehandle
>                 try:
>                     setattr(self, "%s_handle" % (format_type),
> open(out_file,'w'))
>                 except:
>                     raise IOError, "Could not open file: %s" %
> (out_file)
>             #if output goes to a string
>             else:
>                 #Opens the format type StringIO
>                 try:
>                     setattr(self, "%s_handle" % (format_type),
> StringIO.StringIO())
>                 except:
>                     raise IOError, "Could not open string output: %s"
> % (out_file)
>
>     def parse_doc(self, abbyy_filename):
>         """Parses the abbyy document"""
>         #Write the first line of the xml doc, if specified
>         if getattr(self, 'xml_handle', None):
>             self.xml_handle.write('<?xml version="1.0"
> encoding="utf-8"?>\n')
>         #Memory efficient iterparse opens file and loops through
> content
>         for event, element in iterparse(abbyy_filename,
> events=("start", "end")):
>             #ignore the namespace, if it has one
>             if NAMESPACE_REGEX.search(element.tag, 0):
>                 element_tag = NAMESPACE_REGEX.search(element.tag,
> 0).group(1)
>             else:
>                 element_tag = element.tag
>             #if this is the page element
>             if element_tag == 'page':
>                 self.write_page(event, element)
>             #If at the beginning of the line
>             elif element_tag == 'line' and event == 'start':
>                 #Create the line
>                 line = AbbyyLine(element)
>                 #Instantiate first word
>                 word = AbbyyWord(line)
>             #If at the end of the line, and an output text file exists
>             if element_tag == 'line' and event == 'end' and \
>                 getattr(self, 'text_handle', None):
>                 #output line data to text file
>                 line.write_line(self.text_handle)
>             #If at the end of the line, and an output text file exists
>             if element_tag == 'line' and event == 'end' and \
>                 getattr(self, 'xml_handle', None):
>                 #output line data to text file
>                 word.write_word(self.xml_handle)
>             #if outputting to an xml file, create word data
>             if getattr(self, 'xml_handle', None) and \
>                 element_tag == 'charParams' and event == 'start':
>                 #Insert character into word
>                 word.insert_char(element, self.xml_handle)
>             #if outputting to a text file, create line data
>             if getattr(self, 'text_handle', None) and \
>                 element_tag == 'charParams' and event == 'start':
>                 #Insert character into line
>                 line.insert_char(element)
>
>     def write_page(self, event, element):
>         """Parse the page contents"""
>         #page open tag event
>         if event == 'start':
>             #Write page info to xml file
>             if getattr(self, 'xml_handle', None):
>                 #Get the page info
>                 x_dim = element.get('width')
>                 y_dim = element.get('height')
>                 resolution = element.get('resolution')
>                 #Write the page info to the file
>                 self.xml_handle.write('<page>\n')
>                 self.xml_handle.write('<filename/>\n')
>                 self.xml_handle.write('<confidence/>\n')
>                 self.xml_handle.write("<xDim>%s</xDim>\n" % (x_dim))
>                 self.xml_handle.write("<yDim>%s</yDim>\n" % (y_dim))
>                 self.xml_handle.write("<resolution>%s</resolution>\n"
> % (resolution))
>                 self.xml_handle.write('<zone/>\n')
>                 self.xml_handle.write('<wordsboundingboxes>\n')
>         #page close tag event
>         elif event == 'end':
>             #Write page info to xml file
>             if getattr(self, 'xml_handle', None):
>                 #Write closing tags to file
>                 self.xml_handle.write('</wordsboundingboxes>\n')
>                 self.xml_handle.write('</page>')
>
>     def write_line(self, event, element):
>         """Parse the line contents"""
>         #line open tag event
>         if event == 'start':
>             pass
>         #page close tag event
>         elif event == 'end':
>             pass
>
>     def write_word(self, event, element):
>         """Parse the charParams contents"""
>         pass
>
>     def close_filehandles(self, abbyy_filename, string_only):
>         """Close the open filehandles"""
>         #if the files exist
>         if not string_only:
>             #Loop through the different formats
>             for format_type in self.format_list:
>                 #Opens the format type filehandle
>                 try:
>                     getattr(self, "%s_handle" % (format_type)).close()
>                 except:
>                     raise IOError, "Could not close format type: %s
> for file: %s" \
>                         % (format_type, abbyy_filename)

I think this is a bug with iterparse. I switched to using regular
parse for the parse_doc function, and it worked just fine:

def parse_doc(self, abbyy_filename):
    """Parse the abbyy document and write it to the open output handles.

    The whole document is loaded with ``parse`` and then walked in
    document order.  Because a tree walk has no "end" events, a line is
    written out when the next line or page starts, or when the walk ends.

    abbyy_filename -- path of the abbyy file, or an already opened
        file-like object (anything that has a ``read`` attribute).

    Each page written with write_page('start', ...) is closed with a
    matching write_page('end', ...); nothing is closed if the document
    contains no page element.  charParams elements that appear before
    the first line of a page are skipped, since there is no line to
    insert them into.
    """
    #Look the output handles up once; either may be missing
    xml_handle = getattr(self, 'xml_handle', None)
    text_handle = getattr(self, 'text_handle', None)

    def flush_line(line, word):
        """Write a finished line to every output handle that is open"""
        if line is None:
            return
        #The two outputs are independent, so neither check may be an
        #elif: with both handles open both must be written
        if text_handle:
            line.write_line(text_handle)
        if xml_handle:
            word.write_word(xml_handle)

    #Write the first line of the xml doc, if specified
    if xml_handle:
        xml_handle.write('<?xml version="1.0" encoding="utf-8"?>\n')

    #abbyy_filename is already an instance of a file-like object
    if hasattr(abbyy_filename, 'read'):
        tree = parse(abbyy_filename)
    #otherwise open the abbyy file ourselves
    else:
        #Binary mode: the parser is handed the raw bytes of the file
        source = open(abbyy_filename, 'rb')
        try:
            tree = parse(source)
        finally:
            #Close the file even when parsing fails
            source.close()

    page = None
    line = None
    word = None
    for element in tree.getroot().iter("*"):
        #ignore the namespace, if it has one
        match = NAMESPACE_REGEX.search(element.tag)
        if match:
            element_tag = match.group(1)
        else:
            element_tag = element.tag

        #if this is the page element
        if element_tag == 'page':
            #Finish the previous page before starting this one
            flush_line(line, word)
            line = None
            word = None
            if page is not None:
                self.write_page('end', page)
            self.write_page('start', element)
            page = element
        #If at the beginning of a new line
        elif element_tag == 'line':
            #output the line that just ended, if there was one
            flush_line(line, word)
            #Create the line
            line = AbbyyLine(element)
            #Instantiate first word
            word = AbbyyWord(line)
        #A character only belongs somewhere once a line has been started
        elif element_tag == 'charParams' and line is not None:
            #if outputting to an xml file, create word data
            if xml_handle:
                word.insert_char(element, xml_handle)
            #if outputting to a text file, create line data
            if text_handle:
                line.insert_char(element)

    #Output the last line and close the last page
    flush_line(line, word)
    if page is not None:
        self.write_page('end', page)