Help Optimizing Word Search

Wed Jan 12 03:59:54 EST 2005

>>>>> "Case" == Case Nelson <case.nelson at gmail.com> writes:

> Hi there I've just been playing around with some python code and I've
> got a fun little optimization problem I could use some help with.

> Basically, the program needs to take in a random list of no more than
> 10 letters,  and find all possible mutations that match a word in my
> dictionary (80k words). However a wildcard letter '?' is also an
> acceptable character which increases the worst case time significantly.
> So if the letters are ['a','b','c'] check a, b, c, ab, ac, ba, bc, ca,
> cb, abc, acb, bac, bca, cab, cba where only a, ba and cab would be
> added to the dict of words. If the letters are ['?','?'] check a-z, aa,
> ab, ac, ad, ..., az, ba, bb, bc, bd, ..., zz

You're trying to cheat at scrabble aren't you ;-). I once wrote a solution 
using regular expressions, which I brushed up and posted here. Can you
check how it compares with your version? 

Call as "wordsearch abcd". You can also use wildcards.

=====
.#!/usr/bin/python
.
.import sys
.import os
.import re
.
.wordlist = "/usr/share/dict/words"
.
.def get_freq(str):
.    """Create a hash of number of times a character occurs in string"""
.    freq = { }
.    for c in str:
.        freq[c] = freq.get(c, 0) + 1
.    return freq
.        
.
.def main(argv = sys.argv):
.    chars = argv[1]
.    if argv[2]:
.        wordlist = argv[2]
.    wild = chars.count("?")
.    if wild:
.        chars = chars.replace("?", "")  # remove the wild cards
.    if chars:
.        charpat = "[" + chars + "]*"
.    else:
.        charpat = ""
.
.    # create a regexp suitable for matching
.    pattern = charpat
.    for w in xrange(wild):
.        pattern += ".?" + charpat
.    pattern = "^" + pattern + "$"
.    print >> sys.stderr, pattern
.
.    # char count for our list of chars
.    charfreq = get_freq(chars)
.
.    pattern_re = re.compile(pattern, re.IGNORECASE)
.    # for line in os.popen("zegrep -i '%s' %s" % (pattern, wordlist)):
.    for line in open(wordlist):
.        line = line.rstrip()
.        if not pattern_re.search(line):
.            continue
.        if not line or len(line) > len(chars) + wild:
.            continue
.        line = line.lower()
.        linefreq = get_freq(line)
.        
.        # check if chars occur fewer times than in the given string
.        extra = 0
.        for c in linefreq.keys():
.            if linefreq[c] > charfreq.get(c, 0):
.                extra += linefreq[c] - charfreq.get(c, 0)
.            if extra > wild:
.                break
.        else:
.            print line
.        
.if __name__=='__main__':
.    main()

=====