Help Optimizing Word Search
Ganesan R
rganesan at myrealbox.com
Wed Jan 12 03:59:54 EST 2005
>>>>> "Case" == Case Nelson <case.nelson at gmail.com> writes:
> Hi there I've just been playing around with some python code and I've
> got a fun little optimization problem I could use some help with.
> Basically, the program needs to take in a random list of no more than
> 10 letters, and find all possible mutations that match a word in my
> dictionary (80k words). However a wildcard letter '?' is also an
> acceptable character which increases the worst case time significantly.
> So if the letters are ['a','b','c'] check a, b, c, ab, ac, ba, bc, ca,
> cb, abc, acb, bac, bca, cab, cba where only a, ba and cab would be
> added to the dict of words. If the letters are ['?','?'] check a-z, aa,
> ab, ac, ad, ..., az, ba, bb, bc, bd, ..., zz
You're trying to cheat at scrabble aren't you ;-). I once wrote a solution
using regular expressions, which I brushed up and posted here. Can you
check how it compares with your version?
Call as "wordsearch abcd". You can also use wildcards.
=====
.#!/usr/bin/python
.
.import sys
.import os
.import re
.
.wordlist = "/usr/share/dict/words"
.
.def get_freq(str):
. """Create a hash of number of times a character occurs in string"""
. freq = { }
. for c in str:
. freq[c] = freq.get(c, 0) + 1
. return freq
.
.
.def main(argv = sys.argv):
. chars = argv[1]
. if argv[2]:
. wordlist = argv[2]
. wild = chars.count("?")
. if wild:
. chars = chars.replace("?", "") # remove the wild cards
. if chars:
. charpat = "[" + chars + "]*"
. else:
. charpat = ""
.
. # create a regexp suitable for matching
. pattern = charpat
. for w in xrange(wild):
. pattern += ".?" + charpat
. pattern = "^" + pattern + "$"
. print >> sys.stderr, pattern
.
. # char count for our list of chars
. charfreq = get_freq(chars)
.
. pattern_re = re.compile(pattern, re.IGNORECASE)
. # for line in os.popen("zegrep -i '%s' %s" % (pattern, wordlist)):
. for line in open(wordlist):
. line = line.rstrip()
. if not pattern_re.search(line):
. continue
. if not line or len(line) > len(chars) + wild:
. continue
. line = line.lower()
. linefreq = get_freq(line)
.
. # check if chars occur fewer times than in the given string
. extra = 0
. for c in linefreq.keys():
. if linefreq[c] > charfreq.get(c, 0):
. extra += linefreq[c] - charfreq.get(c, 0)
. if extra > wild:
. break
. else:
. print line
.
.if __name__=='__main__':
. main()
=====
More information about the Python-list
mailing list