#!/usr/bin/env python '''levenshtein.py Created by Vincent Van Asch on 09/09/09. Copyright (c) 2009 CNTS. All rights reserved. # License: GNU General Public License, see http://www.clips.ua.ac.be/~vincent/scripts/LICENSE.txt ''' __author__='Vincent Van Asch' __date__='09/09/09' __url__ = 'http://www.clips.ua.ac.be/~vincent/software.html' def distance(w1, w2, encoding='utf8'): '''Returns the Levenshtein distance between two words according to: http://en.wikipedia.org/wiki/Levenshtein_distance w1, w2: unicode strings If w1,w2 are not unicode then the encoding parameter is used to decode them. ''' if isinstance(w1, str): w1 = w1.decode(encoding) if isinstance(w2, str): w2 = w2.decode(encoding) if not (isinstance(w1, unicode) and isinstance(w2, unicode)): raise TypeError('w1 and w2 must be unicode strings') length1=len(w1)+1 length2=len(w2)+1 d=[] for i in range(length1): if i == 0: d.append(range(length2)) else: d.append([i]+[0]*(length2-1)) for j in range(1,length2): for i in range(1,length1): if w2[j-1]==w1[i-1]: cost=0 else: cost=1 v= min( d[i-1][j]+1, #insertion d[i][j-1]+1, #deletion d[i-1][j-1]+cost ) #substitution d[i][j] = v return d[length1-1][length2-1]