Update edit similarity computation in edit_distance.

This commit is contained in:
shibing624 2023-02-01 15:24:31 +08:00
parent d3d424d384
commit 472164b766

View File

@ -83,10 +83,10 @@ def edit_distance(str1, str2):
# very fast
# http://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
import Levenshtein
d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
d = 1.0 - Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
except:
# https://docs.python.org/2/library/difflib.html
d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
d = 1.0 - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
return d
@ -109,7 +109,7 @@ def is_str_match(str1, str2, threshold=1.0):
if float(threshold) == 1.0:
return str1 == str2
else:
return (1. - edit_distance(str1, str2)) >= threshold
return (1.0 - edit_distance(str1, str2)) >= threshold
def longest_match_size(str1, str2):
@ -205,8 +205,9 @@ if __name__ == '__main__':
str1_test = "你到底是谁?"
str2_test = "没想到我是谁,是真样子"
print('strs:', str1_test, str2_test)
print('strs:', str1_test, ' vs ', str2_test)
print(edit_distance(str1_test, str2_test))
print('edit sim:', 1 - edit_distance(str1_test, str2_test))
print(num_of_common_sub_str(str1_test, str2_test))
print(max_min_normalize(vec1_test)) # 归一化0-1
print(z_score(vec1_test)) # 标准化0附近正负
@ -216,4 +217,3 @@ if __name__ == '__main__':
print(f"{str1} vs {str2} common sub str: {num_of_common_sub_str(str1, str2)}")
print(f"{str1} vs {str2} longest match size: {longest_match_size(str1, str2)}")
print(f"{str1} vs {str2} longest match ratio: {longest_match_ratio(str1, str2)}")