Python newbie needs Unicode help

Fri Jan 12 09:44:27 EST 2007

Can you attach files in this forum? Couldn't find the option. Oh well,
here's the file.

#!/usr/bin/python
# Version: 1.1
# Author:  Steve Losh

from sets import Set
from optparse import OptionParser
from xml.dom.minidom import parse

# Prefix prepended to each utterance's audio file name in the
# "recognize" line produced by Utterance.
AudioPath     = 'audio/'
# Default manifest path.  NOTE(review): main() takes the path from the
# command line and never updates this global, so it appears to be unused.
DatafilePath  = 'utterances.trmxml'
# Confidence values printed on every "context_define" line.  main()
# replaces both with the values of the -l and -u command-line options.
CONFIDENCE_LOW  = None #'500'
CONFIDENCE_HIGH = None #'500'

# Parsed manifest document; assigned by main() from
# xml.dom.minidom.parse() before any other function reads it.
utterancesFile = None

class Utterance:
    """One utterance to be tested, as described by the manifest.

    Attributes (stored exactly as passed to the constructor):
      audio      -- audio file name; AudioPath is prepended on output
      grammarSet -- name of the grammar set (context) the utterance
                    belongs to
      text       -- the transcribed text of the utterance
    """

    def __init__(self, audio, grammarSet, text):
        self.audio      = audio
        self.grammarSet = grammarSet
        self.text       = text

    def __unicode__(self):
        """Return the statements for this utterance as text.

        The result is a unicode object whenever any of the attributes
        is unicode, so it must not be handed to print via __str__
        without encoding it first."""
        return "SWIrecAcousticStateReset\ntranscription " + self.text \
                + "\nrecognize " + AudioPath + self.audio

    def __str__(self):
        """Return the statements for this utterance as a byte string.

        print and str() need a byte string from __str__; a unicode
        result containing non-ASCII characters would be converted with
        the 'ascii' codec and raise UnicodeEncodeError."""
        text = self.__unicode__()
        if isinstance(text, str):
            # Already a plain str (all attributes were byte strings).
            return text
        # NOTE(review): UTF-8 is assumed to be the encoding the consumer
        # of this output expects -- confirm.
        return text.encode('utf-8')

def getGrammarPaths():
    """Collect the path of every grammar referenced in the manifest.

    Reads the 'uri' attribute of each Grammar tag in the parsed
    manifest and keeps the part that follows 'servlet/CA/'.  The result
    is a Set, so a grammar listed several times appears only once.

    TODO:
    Find a less fragile way to split off the server half of the URIs."""
    paths = Set()
    for grammarTag in utterancesFile.getElementsByTagName('Grammar'):
        uri = grammarTag.getAttribute('uri')
        # Everything after the servlet prefix is the grammar's path.
        paths.add(uri.split('servlet/CA/')[1])
    return paths

def createGrammarNameFromPath(path):
    """Derive a grammar name from a grammar path.

    Every '/' becomes '-' and every '.' becomes '_'; all other
    characters are kept unchanged."""
    withoutSlashes = path.replace('/', '-')
    return withoutSlashes.replace('.', '_')

def loadGrammars():
    """Output the statements that will load the required grammars."""
    grammarPaths = list(getGrammarPaths())
    grammarsToLoad = {}
    for path in grammarPaths:
        grammarName = createGrammarNameFromPath(path)
        grammarsToLoad[grammarName] = "grammars/" + path
    for grammarName in grammarsToLoad:
        print "SWIrecGrammarLoad", grammarName,
grammarsToLoad[grammarName]

def loadGrammarSets():
    """Output the statements that will define the grammar
sets/contexts.  Returns a list of the grammar set names."""
    grammarSetList  =
utterancesFile.getElementsByTagName('GrammarSets')
    grammarSets     =
grammarSetList[0].getElementsByTagName('GrammarSet')
    grammarSetNames = []
    for gs in grammarSets:
        grammarSetName = gs.getAttribute('id')
        print "context_define", grammarSetName, CONFIDENCE_LOW,
CONFIDENCE_HIGH
        for g in gs.getElementsByTagName('Grammar'):
            path = g.getAttribute('uri').split('servlet/CA/')[1]
            print "context_add", createGrammarNameFromPath(path),
'1000'
        print "context_end\n"
        grammarSetNames.append(grammarSetName)
    return grammarSetNames

def buildUtterances(call):
    """Build the utterances that belong to one call tag.

    Creates an Utterance from the 'audio', 'grammarSet' and
    'transcribedText' attributes of every Utt tag found under the given
    call tag and returns them as a list."""
    built = []
    for uttTag in call.getElementsByTagName('Utt'):
        audio      = uttTag.getAttribute('audio')
        grammarSet = uttTag.getAttribute('grammarSet')
        text       = uttTag.getAttribute('transcribedText')
        built.append(Utterance(audio, grammarSet, text))
    return built

def getUtterances():
    """Return a list of all the utterances we want to test.

    Walks every Call tag inside the first Calls tag of the parsed
    manifest and concatenates the utterances built for each call."""
    callsTag = utterancesFile.getElementsByTagName('Calls')[0]
    return [utterance
            for callTag in callsTag.getElementsByTagName('Call')
            for utterance in buildUtterances(callTag)]

def loadData(utterances):
    """Outputs the statements that will tell rec_test what to test."""
    contexts = {}
    for u in utterances:
        if u.grammarSet not in contexts:
            contexts[u.grammarSet] = []
        contexts[u.grammarSet].append(u)
    for c in contexts:
        print "open errors " + c + ".errors"
        print "open utd "    + c + ".utd"
        print "context_use", c
        for u in contexts[c]:
	    print u
        print "close utd"
        print "close errors"
        print "\n"

def makeParser():
    """Build the command-line option parser for this script.

    Options:
      -l / --low-confidence    stored as options.lower
      -u / --upper-confidence  stored as options.upper
      -f / --filter            may be repeated; the values are collected
                               in the list options.filter

    Options that are not given are left as None.  Returns the
    configured OptionParser."""
    # The upper confidence option is -u; -h is optparse's built-in help.
    parser = OptionParser( "usage: %prog -l LOWER CONFIDENCE -u UPPER "
                           "CONFIDENCE [-f FILTER1 -f FILTER2 ...] file" )
    parser.add_option("-l", "--low-confidence", dest="lower",
                      help="The lower confidence level to test at.",
                      metavar="CONFIDENCE")
    parser.add_option("-u", "--upper-confidence", dest="upper",
                      help="The upper confidence level to test at.",
                      metavar="CONFIDENCE")
    parser.add_option("-f", "--filter", dest="filter", action="append",
                      help="Only test utterances transcribed as WORD.",
                      metavar="WORD")
    return parser

def main():
    global utterancesFile, CONFIDENCE_HIGH, CONFIDENCE_LOW
    parser = makeParser()
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("One data file must be specified.")
    elif options.lower == None:
        parser.error("A lower confidence level must be specified.")
    elif options.upper == None:
        parser.error("An upper confidence level must be specified.")

    DatafilePath = args[0]
    CONFIDENCE_LOW  = options.lower
    CONFIDENCE_HIGH = options.upper

    utterancesFile = parse(DatafilePath)

    print ':ACC\n\n'
    loadGrammars()
    print "\n\n"

    grammarSetNames = loadGrammarSets()

    utterances = getUtterances()
    if options.filter != None:
        utterances = [u for u in utterances if u.text in
options.filter]

    print "\n\n"
    loadData(utterances)

    print "report summary summary.txt"
    print "report oov oov.txt"
    print "report words words.txt"

# Generate the test script only when this file is run directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

Peter Otten wrote:
> gheissenberger at gmail.com wrote:
>
> > HELP!
> > Guy who was here before me wrote a script to parse files in Python.
> >
> > Includes line:
> > print u
>
> According to your other posts 'u' seems to be an instance of a custom
> Utterance class with a __str__() method that accidentally returns unicode.
> Try changing the print statement to
>
> print unicode(u)
>
> If you're lucky, it works. Otherwise we need a piece of the actual code. To
> give you an idea what a self-contained demonstration of your problem might
> look like:
>
> >>> class Utterance(object):
> ...     def __str__(self): return u"äöü"
> ...
> >>> u = Utterance()
> >>> print u
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
> UnicodeEncodeError: 'ascii' codec can't encode characters in position 0-2:
> ordinal not in range(128)
> >>> print unicode(u)
> äöü
> 
> Peter