Update __init__.py

Use _levenshtein_distance() in place of _unigram_overlap(). The weights still need to be adjusted for better results: right now the following two sentences are still regarded as the same.

synonyms.compare('目前你用什么方法来保护朋友',  '目前你用什么方法来保护家人')
This commit is contained in:
bobbercheng 2017-11-14 09:53:28 -06:00 committed by GitHub
parent 902fd83808
commit cdb85530f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -185,6 +185,30 @@ def _unigram_overlap(sentence1, sentence2):
return ((float)(len(intersection)) / (float)(len(union)))
def _levenshtein_distance(sentence1, sentence2):
'''
Return the Levenshtein distance between two strings.
Based on:
http://rosettacode.org/wiki/Levenshtein_distance#Python
'''
first = sentence1.split()
second = sentence2.split()
if len(first) > len(second):
first, second = second, first
distances = range(len(first) + 1)
for index2, char2 in enumerate(second):
new_distances = [index2 + 1]
for index1, char1 in enumerate(first):
if char1 == char2:
new_distances.append(distances[index1])
else:
new_distances.append(1 + min((distances[index1],
distances[index1 + 1],
new_distances[-1])))
distances = new_distances
levenshtein = distances[-1]
return 2 ** (-1 * levenshtein)
def _similarity_distance(s1, s2):
'''
@ -194,9 +218,9 @@ def _similarity_distance(s1, s2):
b = _sim_molecule(_get_wv(s2))
# https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
g = 1 / (np.linalg.norm(a - b) + 1)
u = _unigram_overlap(s1, s2)
u = _levenshtein_distance(s1, s2)
r = g * 1.4 + u * 0.2
r = min((r * 10 + 0.1) , 1.0)
r = min((r * 10 + 0.1), 1.0)
return float("%.3f" % r)