Fuzzy string matching?
Tim Peters
tim_one at email.msn.com
Sat Aug 28 04:05:21 EDT 1999
[John Machin, disses Python's soundex implementation]
[Skip Montanaro]
> ...
> Guido would like sondex.c out of the core. Tim Peters sent a Python
> replacement to me. I was supposed to give it some life. It's still
> sitting in my inbox.
So what does it need to come to life? It's code; it works <wink>.
> I'll try to take care of it next week (bug fixes and all).
It didn't have the bugs John's worried about.
> (I realize that doesn't make Soundex any better for the fuzzy
> matching algorithm, but I should at least fix any obvious bugs...)
I find Knuth's description of the algorithm ambiguous in several respects, and
there are many incompatible soundex algorithms "out there". The one I sent
before probably should have treated non-letters as if they didn't exist, so
I'll attach a changed version that does so.
soundex-is-a-pragmatic-hack-so-right-vs-wrong-is-fuzzy-ly y'rs - tim
NDIGITS = 3
import string
_upper = string.upper
_translate = string.translate
# P for Punctuation -- all characters treated as if they weren't there
_tran = ["P"] * 256
def _setcode(letters, value):
for ch in letters:
_tran[ord(_upper(ch))] = _tran[ord(ch)] = value
_setcode("bfpv", "1")
_setcode("cgjkqsxz", "2")
_setcode("dt", "3")
_setcode("l", "4")
_setcode("mn", "5")
_setcode("r", "6")
# B for Break -- these guys break runs
_setcode("aeiouy", "B")
# I for Invisible -- they're invisible except as first char
_setcode("hw", "I")
_punctuation = filter(lambda i: _tran[i] == "P", range(256))
_punctuation = string.join(map(chr, _punctuation), "")
assert len(_punctuation) == 256 - 52, "Soundex initialization failed"
_tran = string.join(_tran, "")
del string, _setcode
def soundex(name):
"""name -> Soundex code, following Knuth Vol 3 Ed 2 pg 394."""
while name and _tran[ord(name[0])] == "P":
name = name[1:]
if not name:
raise ValueError("soundex requires non-empty name argument")
coded = _translate(name, _tran, _punctuation)
out = _upper(name[0])
lastrealcode = coded[0]
ignore_same = 1
for thiscode in coded[1:]:
if thiscode == "B":
ignore_same = 0
continue
if thiscode == "I":
continue
if ignore_same and lastrealcode == thiscode:
continue
out = out + thiscode
lastrealcode = thiscode
ignore_same = 1
if len(out) > NDIGITS:
break
if len(out) < NDIGITS + 1:
out = out + "0" * (NDIGITS + 1 - len(out))
return out
def _test():
global nerrors
def check(name, expected):
global nerrors
got = soundex(name)
if got != expected:
nerrors = nerrors + 1
print "error in Soundex test: name", name, \
"expected", expected, "got", got
nerrors = 0
check("Euler", "E460")
check("Ellery", "E460")
check("guass", "G200")
check("gauss", "G200")
check("Ghosh", "G200")
check("HILBERT", "H416")
check("Heilbronn", "H416")
check("Knuth", "K530")
check("K ** n U123t9247h ", "K530")
check("!Kant", "K530")
check("Lloyd", "L300")
check("Liddy", "L300")
check("Lukasiewicz", "L222")
check("Lissajous", "L222")
check("Wachs", "W200")
check("Waugh", "W200")
check("HYHYH", "H000")
check("kkkkkkkwwwwkkkkkhhhhhkkkkemmnmnhmn", "K500")
check("Rogers", "R262")
check("Rodgers", "R326")
check("Sinclair", "S524")
check("St. Clair", "S324")
check("Tchebysheff", "T212")
check("@@=T-c=h,e+b#y{s}h\000e\377f:f|", "T212")
check(" C h e b y s h e v ", "C121")
if nerrors:
raise SystemError("soundex test failed with " + `nerrors` +
" errors")
if __name__ == "__main__":
_test()
a
More information about the Python-list
mailing list