Jarow-Winkler algorithm: Measuring similarity between strings
Øyvind
oyvinrog at gmail.com
Sat Dec 20 00:02:08 CET 2008
Based on examples and formulas from http://en.wikipedia.org/wiki/Jaro-Winkler.
Useful for measuring similarity between two strings. For example if
you want to detect that the user did a typo.
def jarow(s1,s2):
""" Returns a number between 1 and 0, where 1 is the most similar
example:
print jarow("martha","marhta")
"""
m= jarow_m(s1,s2)
t1 = jarow_t(s1,s2)
t2 = jarow_t(s2,s1)
t = float(t1)/float(t2)
d = 0.1
# this is the jaro-distance
d_j = 1.0/3.0 * ((m/len(s1)) + (m/len(s2)) + ((m - t)/float(m)))
# if the strings are prefixed similar, they are weighted more
heavily
l = winkler_l(s1,s2)
print l
return d_j + (l * 0.1 * (1 - d_j))
def winkler_l(s1,s2):
""" Number of the four first characters matching """
l = 0
counter = 0
for s_j,s_i in zip(s1,s2):
if s_j == s_i:
l += 1
counter += 1
if counter > 4:
break
return l
def jarow_m(s1,s2):
""" Number of matching characters """
m = 0
d = {}
for s in s1:
d[s] = True
for s in s2:
if d.has_key(s):
m += 1
return m
def jarow_t(s1,s2):
"""
Number of transpositions
"""
t= 0
pos ={}
counter = 0
for s in s1:
pos[s] = counter
counter += 1
counter = 0
for s in s2:
if pos.has_key(s):
if pos[s] != counter:
t += 1
counter += 1
return t
More information about the Python-list
mailing list