#60 compare 支持交换句子

This commit is contained in:
Hai Liang Wang 2018-05-28 11:44:36 +08:00
parent c580b3d82d
commit 4a44eff88f
5 changed files with 32 additions and 21 deletions

View File

@@ -1,3 +1,5 @@
# 3.6
* Fix Bug: compare 保证交换两个句子后分数一致 [#60](https://github.com/huyingxi/Synonyms/issues/60)
# 3.5
* 根据实际情况,降低向量距离对近似度分数的影响

View File

@@ -1 +1 @@
synonyms>=3.5
synonyms>=3.6

View File

@@ -114,6 +114,15 @@ class Test(unittest.TestCase):
r = synonyms.compare(sen1, sen2, seg=False)
print("%s vs %s" % (sen1, sen2), r)
def test_swap_sent(self):
    """Check that synonyms.compare gives the same score in either order.

    Compares one pair of words in both argument orders, prints both
    scores, and fails if they differ (regression test for issue #60).
    """
    print("test_swap_sent")
    s1 = synonyms.compare("教学", "老师")
    s2 = synonyms.compare("老师", "教学")
    print('"教学", "老师": %s ' % s1)
    print('"老师", "教学": %s ' % s2)
    # assertEqual, unlike a bare assert, is not stripped by `python -O`
    # and includes both scores in the failure report.
    self.assertEqual(s1, s2, "Scores should be the same after swap sents")
def test_nearby(self):
synonyms.display("奥运") # synonyms.display calls synonyms.nearby
synonyms.display("北新桥") # synonyms.display calls synonyms.nearby

View File

@@ -13,7 +13,7 @@ Welcome
setup(
name='synonyms',
version='3.5.0',
version='3.6.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',

View File

@@ -211,28 +211,28 @@ def _nearby_levenshtein_distance(s1, s2):
使用空间距离近的词汇优化编辑距离计算
'''
s1_len, s2_len = len(s1), len(s2)
maxlen = max(s1_len, s2_len)
first, second = (s2, s1) if s1_len == maxlen else (s1, s2)
ft_1 = set() # all related words with first sentence
maxlen = s1_len
if s1_len == s2_len:
first, second = sorted([s1, s2])
elif s1_len < s2_len:
first = s1
second = s2
maxlen = s2_len
else:
first = s2
second = s1
ft = set() # all related words with first sentence
for x in first:
ft_1.add(x)
ft.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_1.add(o)
ft_2 = set() # all related words with second sentence
for x in second:
ft_2.add(x)
n, _ = nearby(x)
for o in n[:5]:
ft_2.add(0)
for o in n[:10]:
ft.add(o)
scores = []
if len(ft_1) == 0 or len(ft_2) == 0: return 0.0 # invalid length
for x in ft_1:
for y in ft_2:
scores.append([_levenshtein_distance(x, y)])
s = np.sum(scores) / (s1_len * s2_len)
for x in second:
scores.append(max([_levenshtein_distance(x, y) for y in ft]))
s = np.sum(scores) / maxlen
return s
def _similarity_distance(s1, s2, ignore):