Python nuube needs Unicode help
gheissenberger at gmail.com
gheissenberger at gmail.com
Fri Jan 12 09:44:27 EST 2007
Can you attach files in this forum? Couldn't find the option. Oh well,
here's the file.
#!/usr/bin/python
# Version: 1.1
# Author: Steve Losh
from sets import Set
from optparse import OptionParser
from xml.dom.minidom import parse
AudioPath = 'audio/'
DatafilePath = 'utterances.trmxml'
CONFIDENCE_LOW = None #'500'
CONFIDENCE_HIGH = None #'500'
utterancesFile = None
class Utterance:
def __init__(self, audio, grammarSet, text):
self.audio = audio
self.grammarSet = grammarSet
self.text = text
def __str__(self):
return "SWIrecAcousticStateReset\ntranscription " + self.text \
+ "\nrecognize " + AudioPath + self.audio
def getGrammarPaths():
"""Get the paths of all the grammars needed. Returns a Set
containing the results.
If a grammar is listed more than once in the transcription manifest it
will only appear once in these results.
TODO:
Find a less fragile way to split off the server half of the URIs."""
grammarTags = utterancesFile.getElementsByTagName('Grammar')
grammarURIs = [tag.getAttribute('uri') for tag in grammarTags]
grammarPaths = [uri.split('servlet/CA/')[1] for uri in grammarURIs]
return Set(grammarPaths)
def createGrammarNameFromPath(path):
"""Convert a given path into an appropriate name for the
grammar."""
path = path.replace('/', '-') # Strip the directory slashes
path = path.replace('.', '_') # and the dot before the extension.
return path
def loadGrammars():
"""Output the statements that will load the required grammars."""
grammarPaths = list(getGrammarPaths())
grammarsToLoad = {}
for path in grammarPaths:
grammarName = createGrammarNameFromPath(path)
grammarsToLoad[grammarName] = "grammars/" + path
for grammarName in grammarsToLoad:
print "SWIrecGrammarLoad", grammarName,
grammarsToLoad[grammarName]
def loadGrammarSets():
"""Output the statements that will define the grammar
sets/contexts. Returns a list of the grammar set names."""
grammarSetList =
utterancesFile.getElementsByTagName('GrammarSets')
grammarSets =
grammarSetList[0].getElementsByTagName('GrammarSet')
grammarSetNames = []
for gs in grammarSets:
grammarSetName = gs.getAttribute('id')
print "context_define", grammarSetName, CONFIDENCE_LOW,
CONFIDENCE_HIGH
for g in gs.getElementsByTagName('Grammar'):
path = g.getAttribute('uri').split('servlet/CA/')[1]
print "context_add", createGrammarNameFromPath(path),
'1000'
print "context_end\n"
grammarSetNames.append(grammarSetName)
return grammarSetNames
def buildUtterances(call):
"""This function takes a call tag, builds the utterances belonging
to it and returns a list containing them."""
utts = call.getElementsByTagName('Utt')
utterances = [Utterance( utt.getAttribute('audio'), \
utt.getAttribute('grammarSet'),
utt.getAttribute('transcribedText') ) \
for utt in utts]
return utterances
def getUtterances():
"""Returns a list of all the utterances we want to test."""
callList = utterancesFile.getElementsByTagName('Calls')[0]
calls = callList.getElementsByTagName('Call')
utterances = []
for c in calls:
utterances.extend(buildUtterances(c))
return utterances
def loadData(utterances):
"""Outputs the statements that will tell rec_test what to test."""
contexts = {}
for u in utterances:
if u.grammarSet not in contexts:
contexts[u.grammarSet] = []
contexts[u.grammarSet].append(u)
for c in contexts:
print "open errors " + c + ".errors"
print "open utd " + c + ".utd"
print "context_use", c
for u in contexts[c]:
print u
print "close utd"
print "close errors"
print "\n"
def makeParser():
parser = OptionParser( "usage: %prog -l LOWER CONFIDENCE -h UPPER
CONFIDENCE [-f FILTER1 -f FILTER2 ...] file" )
parser.add_option("-l", "--low-confidence", dest="lower", \
help="The lower confidence level to test at.",
metavar="CONFIDENCE")
parser.add_option("-u", "--upper-confidence", dest="upper", \
help="The upper confidence level to test at.",
metavar="CONFIDENCE")
parser.add_option("-f", "--filter", dest="filter", action="append",
\
help="Only test utterances transcribed as WORD.",
metavar="WORD")
return parser
def main():
global utterancesFile, CONFIDENCE_HIGH, CONFIDENCE_LOW
parser = makeParser()
(options, args) = parser.parse_args()
if len(args) != 1:
parser.error("One data file must be specified.")
elif options.lower == None:
parser.error("A lower confidence level must be specified.")
elif options.upper == None:
parser.error("An upper confidence level must be specified.")
DatafilePath = args[0]
CONFIDENCE_LOW = options.lower
CONFIDENCE_HIGH = options.upper
utterancesFile = parse(DatafilePath)
print ':ACC\n\n'
loadGrammars()
print "\n\n"
grammarSetNames = loadGrammarSets()
utterances = getUtterances()
if options.filter != None:
utterances = [u for u in utterances if u.text in
options.filter]
print "\n\n"
loadData(utterances)
print "report summary summary.txt"
print "report oov oov.txt"
print "report words words.txt"
if __name__ == '__main__':
main()
Peter Otten wrote:
> gheissenberger at gmail.com wrote:
>
> > HELP!
> > Guy who was here before me wrote a script to parse files in Python.
> >
> > Includes line:
> > print u
>
> According to your other posts 'u' seems to be an instance of a custom
> Utterance class with a __str__() method that accidentally returns unicode.
> Try changing the print statement to
>
> print unicode(u)
>
> If you're lucky, it works. Otherwise we need a piece of the actual code. To
> give you an idea what a self-contained demonstration of your problem might
> look like:
>
> >>> class Utterance(object):
> ... def __str__(self): return u"äöü"
> ...
> >>> u = Utterance()
> >>> print u
> Traceback (most recent call last):
> File "<stdin>", line 1, in <module>
> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2:
> ordinal not in range(128)
> >>> print unicode(u)
> äöü
>
> Peter
More information about the Python-list
mailing list