Suggest more finesse, please. I/O and sequences.

Fri Mar 25 15:51:59 EST 2005

Qertoip wrote:
> import sys
> 
> def moreCommonWord( x, y ):
> 	if x[1] != y[1]:
> 		return cmp( x[1], y[1] ) * -1
> 	return cmp( x[0], y[0] )

If you want to keep this, use:

     def moreCommonWord(x, y):
         '''Compare two (word, count) pairs for use with list.sort().

         The pair with the larger count sorts first; pairs with equal
         counts are ordered by ascending word.
         '''
         word_x, count_x = x[0], x[1]
         word_y, count_y = y[0], y[1]
         if count_x == count_y:
             # Same frequency: fall back to ordering by the word itself.
             return cmp(word_x, word_y)
         # Arguments swapped so that the higher count compares as "smaller".
         return cmp(count_y, count_x)

...
I don't like type-based names (Charles Simonyi never convinced me), so:
 > wordsDic = {}
     # Maps each word to the number of times it has been seen.
     corpus = {}

...
> for word in inFile.read().split():
> 	if wordsDic.has_key( word ):
> 		wordsDic[word] = wordsDic[word] + 1
> 	else:
> 		wordsDic[word] = 1
> inFile.close()

How about:
     # Walk the file one line at a time and tally every
     # whitespace-separated word; unseen words start from zero.
     for text in inFile:
         for token in text.split():
             corpus[token] = corpus.get(token, 0) + 1

...

> wordsLst = wordsDic.items()
> wordsLst.sort( moreCommonWord )
OK, here I'm going to get version specific.
For Python 2.4 and later:
     # Sorting (-count, word) tuples ascending puts the most frequent
     # words first and orders equally frequent words by the word itself.
     words = sorted((-freq, word) for word, freq
                                     in corpus.iteritems())
For at least Python 2.2:
     # Build the same (-count, word) tuples as a list, then sort it in
     # place: highest count first, ties broken by word.
     words = [(-freq, word) for word, freq in corpus.iteritems()]
     words.sort()
For before Python 2.2:
     # items() gives (word, count) pairs; the comparison function orders
     # them by descending count, then by ascending word.
     words = corpus.items()
     words.sort(moreCommonWord)

> for pair in wordsLst:
> 	outFile.write( str( pair[1] ).rjust( 7 ) + " : " + str( pair[0] ) + "\n" )
> outFile.close()

Before Python 2.2 (because we use different data for words):
     # Here words holds (word, count) pairs, so the count is printed as is,
     # right-aligned in a seven-character column.
     for word, frequency in words:
         print >>outFile, '%7d : %s' % (frequency, word)

Python 2.2 and later:
  # Here words holds (-count, word) tuples; flip the sign back so the
  # real count is printed, right-aligned in a seven-character column.
  for negfrequency, word in words:
      print >>outFile, '%7d : %s' % (-negfrequency, word)

So, with all my prejudices in place and python 2.4 on my box, I'd
lift a few things to functions:

     def refcount(corpus, infile):
         '''Add the words found in infile to the tallies held in corpus.

         corpus is a mapping from word to occurrence count and is
         updated in place.  infile is any iterable of text lines, such
         as an open file.  A word is whatever str.split() yields, i.e.
         a whitespace-delimited token.
         '''
         for text in infile:
             for token in text.split():
                 # Words not yet in corpus start counting from zero.
                 corpus[token] = corpus.get(token, 0) + 1

     def main(sources, output=None):
         '''Count words in sources and report frequencies to output.

         sources is an iterable of file names; the words of all of them
         are tallied into one combined count.  output is a writable
         file object; with the default of None, "print >>None" writes
         the report to sys.stdout.

         One line per distinct word is written, most frequent first,
         with equally frequent words in ascending word order.
         '''
         corpus = {}
         for source in sources:
             f = open(source)
             try:
                 refcount(corpus, f)
             finally:
                 # Close the source even if counting fails part-way.
                 f.close()
         # Sorting on (-count, word) gives descending counts with ties
         # ordered by word; the sign is flipped back when printing.
         for negfrequency, word in sorted((-frequency, word) for
                               word, frequency in corpus.iteritems()):
             print >>output, '%7d : %s' % (-negfrequency, word)

     if __name__ == '__main__':
         import sys

         # Usage:
         #   script SOURCE              report goes to standard output
         #   script SOURCE... OUTPUT    report is written to OUTPUT
         #
         # The last argument is treated as the output file only when at
         # least one source precedes it.  Otherwise a lone source
         # argument would be opened for writing (and so truncated)
         # instead of being read.
         if len(sys.argv) < 3:
             main(sys.argv[1:])
         else:
             output = open(sys.argv[-1], 'w')
             try:
                 main(sys.argv[1:-1], output)
             finally:
                 # Always close the report file, even if counting fails.
                 output.close()

--Scott David Daniels
Scott.Daniels at Acm.Org