converting text and spans to an ElementTree

Tue May 22 16:38:37 EDT 2007

Gabriel Genellina wrote:
> the idea would be as follows:
> 
> - For each span generate two tuples: (start_offset, 1, end_offset, 
> element) and (end_offset, 0, -start_offset, element). If start==end use 
> (start_offset, -1, start_offset, element).
> - Collect all tuples in a list, and sort them. The tuple is made so when 
> at a given offset there is an element that ends and another that starts, 
> the ending element comes first (because it has a 0). For all the 
> elements that end at a given point, the shortest comes first.
> - Initialize an empty list (will be used as a stack of containers), and 
> create a root Element as your  "current container" (CC), the variable 
> "last_used" will keep the last position used on the text.
> - For each tuple in the sorted list:
>   - if the second item is a 1, an element is starting. Insert it into 
> the CC element, push the CC onto the stack, and set the new element as 
> the new CC. The element data is text[last_used:start_offset], and move 
> last_used to start_offset.
>   - if the second item is a 0, an element is ending. Discard the CC, pop 
> an element from the stack as the new CC. The element data is 
> text[last_used:end_offset], move last_used up to end_offset.
>   - if the second item is a -1, it's an element with no content. Insert 
> it into the CC element.

Thanks a lot! This put me on the right track (though the devil's 
definitely in the details). It's working now::

 >>> tree = xmltools.text_and_spans_to_etree('aaa aaa aaaccc cccaaa', [
...     (etree.Element('a'), 0, 21),
...     (etree.Element('b'), 11, 11),
...     (etree.Element('c'), 11, 18),
... ])
 >>> etree.tostring(tree)
'<a>aaa aaa aaa<b /><c>ccc ccc</c>aaa</a>'
 >>> tree = xmltools.text_and_spans_to_etree('bbb\naaaccc\ncccaaa', [
...     (etree.Element('a'), 0, 17),
...     (etree.Element('b'), 0, 4),
...     (etree.Element('c'), 7, 14),
...     (etree.Element('b'), 14, 14),
... ])
 >>> etree.tostring(tree)
'<a><b>bbb\n</b>aaa<c>ccc\nccc</c><b />aaa</a>'
 >>> tree = xmltools.text_and_spans_to_etree('abc', [
...     (etree.Element('a'), 0, 3),
...     (etree.Element('b'), 0, 3),
...     (etree.Element('c'), 0, 3),
... ])
 >>> etree.tostring(tree)
'<a><b><c>abc</c></b></a>'

And for the sake of any poor soul who runs into a similar problem, 
here's the code I wrote using Gabriel's hints above::

def text_and_spans_to_etree(text, spans):
     # Create a list of element starts and ends, sorting them in the
     # order they would appear in an XML version of the text. So for
     # example, with XML text like:
     #     <a> a <b> b </b> a </a>
     # we will see:
     #     starting <a>
     #     starting <b>
     #     ending </b>
     #     ending </a>
     empty = -1
     starting = 0
     ending = 1
     tuples = []
     root_elem = None
     for i, (elem, start, end) in enumerate(spans):
         # validate spans
         if start < 0 or end > len(text):
             msg = 'spans must be in the range 0-%i'
             raise ValueError(msg % len(text))

         # save the first element that spans the entire text as the root
         elif root_elem is None and start == 0 and end == len(text):
             root_elem = elem

         # insert a single tuple for empty elements
         elif start == end:
             tuples.append((start, empty, -end, i, elem))

         # insert starts and ends for all other elements
         else:
             tuples.append((start, starting, -end, i, elem))
             tuples.append((start, ending, -end, -i, elem))
     tuples.sort()

     # make sure we found a root element
     if root_elem is None:
         raise ValueError('one element must span the entire text')

     # iterate through the element starts and ends,
     # updating element texts, tails and children
     last_offset = 0
     last_elem = root_elem
     last_type = starting
     stack = [root_elem]
     for start, offset_type, neg_end, _, elem in tuples:

         # start of an element:
         # add it as a child and add it to the stack
         # next text chunk goes up to the start offset
         if offset_type is starting:
             stack[-1].append(elem)
             stack.append(elem)
             offset = start

         # end of an element:
         # pop if off the stack
         # next text chunk goes up to the end offset
         elif offset_type is ending:
             if elem is not stack[-1]:
                 print elem, stack[-1]
                 assert False
             assert elem is stack.pop()
             offset = -neg_end

         # empty element:
         # add it as a child
         # next text chunk goes up to the start offset
         elif offset_type is empty:
             stack[-1].append(elem)
             offset = start

         # should never get here
         else:
             assert False

         # calculate the next text chunk, and then determine what to do
         # with it based on what we did the last time around:
         # * started an element -> set its .text
         # * ended an element (or inserted an empty) -> set its .tail
         last_text = text[last_offset:offset]
         if last_type is starting:
             last_elem.text = last_text
         elif last_type is ending:
             last_elem.tail = last_text
         elif last_type is empty:
             last_elem.tail = last_text
         else:
             assert False

         # save what we did this time for inspection next time
         last_offset = offset
         last_type = offset_type
         last_elem = elem

     # add any final text before the close of the root element
     last_elem.tail = text[last_offset:]

     return root_elem

Thanks again,

STeVe