# Jarow-Winkler algorithm: Measuring similarity between strings

Øyvind oyvinrog at gmail.com
Sat Dec 20 00:02:08 CET 2008

```Based on examples and formulas from http://en.wikipedia.org/wiki/Jaro-Winkler.
Useful for measuring similarity between two strings. For example if
you want to detect that the user did a typo.

def jarow(s1,s2):

"""  Returns a number between 1 and 0, where 1 is the most similar

example:

print jarow("martha","marhta")

"""
m= jarow_m(s1,s2)
t1 = jarow_t(s1,s2)
t2 = jarow_t(s2,s1)
t = float(t1)/float(t2)

d = 0.1

# this is the jaro-distance
d_j = 1.0/3.0 * ((m/len(s1)) + (m/len(s2)) + ((m - t)/float(m)))

# if the strings are prefixed similar, they are weighted more
heavily
l = winkler_l(s1,s2)
print l
return d_j + (l * 0.1 * (1 - d_j))

def winkler_l(s1,s2):
""" Number of the four first characters matching """

l = 0
counter = 0
for s_j,s_i in zip(s1,s2):

if s_j == s_i:

l += 1
counter += 1

if counter > 4:
break

return l

def jarow_m(s1,s2):

""" Number of matching characters """
m = 0
d = {}
for s in s1:

d[s] = True

for s in s2:

if d.has_key(s):

m += 1
return m
def jarow_t(s1,s2):

"""
Number of transpositions

"""

t= 0
pos ={}
counter = 0
for s in s1:

pos[s] = counter
counter += 1
counter = 0
for s in s2:

if pos.has_key(s):

if pos[s] != counter:

t += 1

counter += 1

return t

```