Trying to get cleaner XML output from a text file
Gabriel Genellina
gagsl-py2 at yahoo.com.ar
Fri May 29 23:56:52 EDT 2009
En Fri, 29 May 2009 14:09:10 -0300, iainemsley <iainemsley at googlemail.com>
escribió:
> I'm using Python2.5 to try and convert some text files into XML using
> xml.minidom. I'm currently doing some plays which have a structure
> like
> Scene 1
> Act 1
> blah blah
> Act2
> blah blah
> Scene 2
> Act 1
> and so on.
(I think you get the hierarchy wrong: usually a play contains some Acts;
each act contains several Scenes)
> I'm trying to turn it into
> <div type="scene">1
> <div type="act">1
> <speech />
> </div>
> <div type="act">2
> <speech />
> </div>
> </div>
> (or ideally <div type="scene" id="1"> but I can always come back to
> this later)
Using ElementTree is a lot easier than minidom:
import sys
from itertools import groupby, count
import xml.etree.ElementTree as ET
import re
class Seq:
    """Automatic sequencer for acts/scenes.

    Hands out consecutive act numbers and, within the current act,
    consecutive scene numbers. Both sequences start at 1.
    """

    def __init__(self):
        self.act_nr = count(1)
        self.scene_nr = count(1)

    def next_scene(self):
        """Return the next scene number within the current act."""
        # The builtin next() works on Python 2.6+ and 3.x; the iterator's
        # own .next() method only exists on Python 2.
        return next(self.scene_nr)

    def next_act(self):
        """Return the next act number and restart scene numbering at 1."""
        self.scene_nr = count(1)
        return next(self.act_nr)


# Module-level sequencer shared by add_act() and add_scene().
seq = Seq()
def add_act(body, act_text):
    """Append one act ``<div>`` to *body* and fill it with its scenes.

    The act gets ``type="act"`` and an id of the form ``a<N>``, where N
    comes from the module-level ``seq``. *act_text* is split on the
    module-level ``scene_sep`` pattern and every resulting chunk is passed
    to ``add_scene``.
    """
    act = ET.SubElement(body, "div", type="act", id="a%s" % seq.next_act())
    chunks = scene_sep.split(act_text)
    # split() always returns the text that precedes the first separator.
    # When that leading chunk is blank it is only an artifact of the split,
    # not a scene, so drop it instead of emitting an empty scene and
    # shifting the numbering of the real ones.
    if chunks and not chunks[0].strip():
        chunks = chunks[1:]
    for scene_text in chunks:
        add_scene(act, scene_text)
def add_scene(act, scene_text):
    """Append one scene ``<div>`` to *act*, with a ``<speech>`` per paragraph.

    The scene gets ``type="scene"`` and an id made of the act's own id
    followed by ``s`` and the next scene number from the module-level
    ``seq``.
    """
    scene_id = "%ss%s" % (act.get("id"), seq.next_scene())
    scene = ET.SubElement(act, "div", type="scene", id=scene_id)
    # NOTE(review): paragraphs() is not defined in this snippet. It is
    # called with the scene's lines (line endings kept) and each item it
    # yields becomes the text of one <speech>; presumably it groups the
    # lines into paragraphs -- confirm against the full script.
    for p in paragraphs(scene_text.splitlines(True)):
        add_speech(scene, p)
def add_speech(scene, p):
    """Append a ``<speech>`` element whose text is *p* to *scene*."""
    speech = ET.SubElement(scene, "speech")
    speech.text = p
# Build the <body> tree from the play text and print it as XML.
body = ET.Element("body")

# Heading patterns, matched case-insensitively:
#   \d+  so numbers containing a zero ("Act 10") match in full;
#   \s*  so headings written without a space ("Act2") are recognised;
#   \b   so the keyword is not matched inside a longer word.
scene_sep = re.compile(r"\bScene\s*\d+", re.I)
act_sep = re.compile(r"\bAct\s*\d+", re.I)

# NOTE(review): `text` is not defined in this snippet; it must already hold
# the full text of the play as a single string -- confirm where it is read.
act_chunks = act_sep.split(text)
# split() always returns the text before the first "Act N" heading; when
# that leading chunk is blank it is not an act, so skip it rather than
# emitting an empty act and shifting the numbering of the real ones.
if act_chunks and not act_chunks[0].strip():
    act_chunks = act_chunks[1:]
for act_text in act_chunks:
    add_act(body, act_text)

doc = ET.ElementTree(body)
# With its default encoding, write() produces encoded bytes. On Python 3
# sys.stdout is a text stream, so use its underlying binary buffer when it
# has one; on Python 2 sys.stdout itself accepts the byte string.
doc.write(getattr(sys.stdout, "buffer", sys.stdout))
--
Gabriel Genellina
More information about the Python-list
mailing list