Suggest more finesse, please. I/O and sequences.
Scott David Daniels
Scott.Daniels at Acm.Org
Fri Mar 25 15:51:59 EST 2005
Qertoip wrote:
> import sys
>
> def moreCommonWord( x, y ):
> if x[1] != y[1]:
> return cmp( x[1], y[1] ) * -1
> return cmp( x[0], y[0] )
If you want to keep this, use:
def moreCommonWord(x, y):
    '''Comparison function for list.sort() over (word, count) pairs.

    Pairs with a higher count sort first; pairs with equal counts are
    ordered by word.  Returns a negative, zero or positive number, as
    cmp() does.
    '''
    if x[1] != y[1]:
        # Swapping the arguments reverses the order (largest count first)
        # without multiplying the result by -1.
        return cmp(y[1], x[1])
    return cmp(x[0], y[0])
...
I don't like type-based names (Charles Simonyi never convinced me), so:
> wordsDic = {}
corpus = {}
...
> for word in inFile.read().split():
> if wordsDic.has_key( word ):
> wordsDic[word] = wordsDic[word] + 1
> else:
> wordsDic[word] = 1
> inFile.close()
How about:
# Iterate over the file line by line instead of reading it all at once.
for line in inFile:
    for word in line.split():
        try:
            corpus[word] += 1
        except KeyError:
            # First time this word is seen.
            corpus[word] = 1
...
> wordsLst = wordsDic.items()
> wordsLst.sort( moreCommonWord )
OK, here I'm going to get version specific.
For Python 2.4 and later:
words = sorted((-freq, word) for word, freq
in corpus.iteritems())
For Python 2.2 or later:
words = [(-freq, word) for word, freq in corpus.iteritems()]
words.sort()
For before Python 2.2:
words = corpus.items()
words.sort(moreCommonWord)
> for pair in wordsLst:
> outFile.write( str( pair[1] ).rjust( 7 ) + " : " + str( pair[0] ) + "\n" )
> outFile.close()
Before Python 2.2 (the loop differs because words holds (word, frequency) pairs here):
# words holds (word, frequency) pairs, already sorted by moreCommonWord.
for word, frequency in words:
    print >>outFile, '%7d : %s' % (frequency, word)
For Python 2.2 and later (words holds (-frequency, word) pairs):
# words holds (-frequency, word) pairs; negate again to print the count.
for negfrequency, word in words:
    print >>outFile, '%7d : %s' % (-negfrequency, word)
So, with all my prejudices in place and python 2.4 on my box, I'd
lift a few things to functions:
def refcount(corpus, infile):
    '''Update the word counters in corpus from the words in infile.

    corpus is a dictionary mapping word -> count and is modified in place.
    infile is any iterable of text lines (such as an open file); each line
    is split on whitespace to obtain its words.
    '''
    for line in infile:
        for word in line.split():
            try:
                corpus[word] += 1
            except KeyError:
                # First occurrence of this word.
                corpus[word] = 1
def main(sources, output=None):
    '''Count words in sources and report frequencies to output.

    sources is an iterable of file names to read.  output is a writable
    file object; when it is None the Python 2 print statement falls back
    to standard output.  One line is written per word, highest count
    first, with equal counts ordered by word.
    '''
    corpus = {}
    for source in sources:
        f = open(source)
        try:
            refcount(corpus, f)
        finally:
            # Close the file even if counting fails part-way through.
            f.close()
    # Sorting (-frequency, word) pairs puts the largest counts first and
    # breaks ties by word, so no comparison function is needed.
    for negfrequency, word in sorted((-frequency, word) for
                               word, frequency in corpus.iteritems()):
        print >>output, '%7d : %s' % (-negfrequency, word)
if __name__ == '__main__':
    import sys
    # Usage: script SOURCE              -> report to standard output
    #        script SOURCE... OUTPUT    -> report written to OUTPUT
    # Only treat the last argument as the output file when at least one
    # source precedes it; a lone argument is a source to read, never a
    # file to open for writing (which would truncate it).
    if len(sys.argv) < 3:
        main(sys.argv[1 :])
    else:
        output = open(sys.argv[-1], 'w')
        try:
            main(sys.argv[1 : -1], output)
        finally:
            output.close()
--Scott David Daniels
Scott.Daniels at Acm.Org
More information about the Python-list
mailing list