Jarow-Winkler algorithm: Measuring similarity between strings

Øyvind oyvinrog at gmail.com
Sat Dec 20 00:02:08 CET 2008


Based on examples and formulas from http://en.wikipedia.org/wiki/Jaro-Winkler.
Useful for measuring similarity between two strings. For example if
you want to detect that the user did a typo.



def jarow(s1,s2):

    """  Returns a number between 1 and 0, where 1 is the most similar

        example:

        print jarow("martha","marhta")

        """
    m= jarow_m(s1,s2)
    t1 = jarow_t(s1,s2)
    t2 = jarow_t(s2,s1)
    t = float(t1)/float(t2)

    d = 0.1

    # this is the jaro-distance
    d_j = 1.0/3.0 * ((m/len(s1)) + (m/len(s2)) + ((m - t)/float(m)))

    # if the strings are prefixed similar, they are weighted more
heavily
    l = winkler_l(s1,s2)
    print l
    return d_j + (l * 0.1 * (1 - d_j))

def winkler_l(s1,s2):
    """ Number of the four first characters matching """

    l = 0
    counter = 0
    for s_j,s_i in zip(s1,s2):

        if s_j == s_i:

            l += 1
        counter += 1

        if counter > 4:
            break

    return l



def jarow_m(s1,s2):

    """ Number of matching characters """
    m = 0
    d = {}
    for s in s1:

        d[s] = True

    for s in s2:


        if d.has_key(s):

            m += 1
    return m
def jarow_t(s1,s2):

    """
    Number of transpositions

    """

    t= 0
    pos ={}
    counter = 0
    for s in s1:

        pos[s] = counter
        counter += 1
    counter = 0
    for s in s2:

        if pos.has_key(s):

            if pos[s] != counter:

                t += 1

        counter += 1

    return t



More information about the Python-list mailing list