Update same-chars similarity and sequence-matcher similarity.

This commit is contained in:
shibing624 2022-12-11 11:59:24 +08:00
parent 59f3084495
commit d236454cc6
5 changed files with 349 additions and 24 deletions

View File

@ -8,8 +8,16 @@ from loguru import logger
sys.path.append('..')
from similarities.literalsim import SimHashSimilarity, TfidfSimilarity, BM25Similarity, \
WordEmbeddingSimilarity, CilinSimilarity, HownetSimilarity
from similarities import (
SimHashSimilarity,
TfidfSimilarity,
BM25Similarity,
WordEmbeddingSimilarity,
CilinSimilarity,
HownetSimilarity,
SameCharsSimilarity,
SequenceMatcherSimilarity,
)
logger.remove()
logger.add(sys.stderr, level="INFO")
@ -54,7 +62,8 @@ if __name__ == '__main__':
queries = [
'我的花呗开通了?',
'乌克兰被俄罗斯警告'
'乌克兰被俄罗斯警告',
'更改绑定银行卡',
]
print('text1: ', text1)
print('text2: ', text2)
@ -65,3 +74,5 @@ if __name__ == '__main__':
sim_and_search(WordEmbeddingSimilarity())
sim_and_search(CilinSimilarity())
sim_and_search(HownetSimilarity())
sim_and_search(SameCharsSimilarity())
sim_and_search(SequenceMatcherSimilarity())

View File

@ -16,12 +16,14 @@ from similarities.literalsim import (
BM25Similarity,
WordEmbeddingSimilarity,
CilinSimilarity,
HownetSimilarity
HownetSimilarity,
SameCharsSimilarity,
SequenceMatcherSimilarity,
)
from similarities.imagesim import (
ImageHashSimilarity,
ClipSimilarity,
SiftSimilarity
SiftSimilarity,
)
from similarities.data_loader import SearchDataLoader
from similarities import evaluation

View File

@ -20,7 +20,7 @@ from tqdm import tqdm
from text2vec import Word2Vec
from similarities.similarity import SimilarityABC
from similarities.utils.distance import string_hash, hamming_distance
from similarities.utils.distance import string_hash, hamming_distance, longest_match_ratio
from similarities.utils.rank_bm25 import BM25Okapi
from similarities.utils.tfidf import TFIDF, load_stopwords, default_stopwords_file
from similarities.utils.util import cos_sim, semantic_search
@ -114,12 +114,21 @@ class SimHashSimilarity(SimilarityABC):
hash_code = hash_code + '0'
return hash_code
def ori_simhash(self, sentence: str):
    """
    Hash the whole sentence in one step.

    The sentence is passed unchanged to ``string_hash``.
    :param sentence: str, text to hash
    :return: hash code as returned by ``string_hash``
    """
    return string_hash(sentence)
def _sim_score(self, seq1, seq2):
    """
    Convert the hamming distance between two hash codes into a similarity score.

    :param seq1: first hash code (any sized sequence accepted by ``hamming_distance``)
    :param seq2: second hash code
    :return: 0.0 when either code has 2 or fewer elements, otherwise
        ``1 - hamming_distance(seq1, seq2, normalize=True)``
    """
    # Codes this short are treated as not comparable.
    if len(seq1) <= 2 or len(seq2) <= 2:
        return 0.0
    # NOTE(review): normalize=True presumably returns the distance as a
    # fraction of the code length (replacing the former manual
    # ``/ len(seq1)``) - confirm against similarities.utils.distance.
    # The old un-normalised computation was a dead store and is removed.
    return 1 - hamming_distance(seq1, seq2, normalize=True)
def similarity(self, a: Union[str, List[str]], b: Union[str, List[str]]):
@ -754,3 +763,194 @@ class HownetSimilarity(SimilarityABC):
result[qid][corpus_id] = score
return result
class SameCharsSimilarity(SimilarityABC):
    """
    Character-overlap similarity between texts, with top-n retrieval over a corpus.

    Character order and position are ignored: two texts are compared through
    the sets of distinct characters they contain.
    """

    def __init__(self, corpus: Union[List[str], Dict[str, str]] = None):
        """
        :param corpus: optional initial documents, either a list of texts or a
            mapping of document id -> text.
        """
        super().__init__()
        self.corpus = {}  # document id -> text
        self.corpus_ids_map = {}  # position in self.corpus -> document id
        if corpus is not None:
            self.add_corpus(corpus)

    def __len__(self):
        """Get length of corpus."""
        return len(self.corpus)

    def __str__(self):
        base = f"Similarity: {self.__class__.__name__}, matching_model: TextChars"
        if self.corpus:
            base += f", corpus size: {len(self.corpus)}"
        return base

    def add_corpus(self, corpus: Union[List[str], Dict[str, str]]):
        """
        Extend the corpus with new documents, skipping texts already stored.

        :param corpus: list of texts (ids are assigned automatically) or a
            mapping of document id -> text (ids are kept as given).
        """
        # One set lookup per document instead of rebuilding a list of all
        # stored texts for every membership test.
        known_docs = set(self.corpus.values())
        corpus_new = {}
        if isinstance(corpus, list):
            # dict.fromkeys de-duplicates while keeping input order, so id
            # assignment is deterministic (set() ordering is not).
            corpus = list(dict.fromkeys(corpus))
            next_id = len(self.corpus)
            for doc in corpus:
                if doc in known_docs:
                    continue
                # Never reuse an id that is already taken; overwriting would
                # silently drop a stored document.
                while next_id in self.corpus:
                    next_id += 1
                corpus_new[next_id] = doc
                next_id += 1
        else:
            for doc_id, doc in corpus.items():
                if doc not in known_docs:
                    corpus_new[doc_id] = doc
        self.corpus.update(corpus_new)
        self.corpus_ids_map = dict(enumerate(self.corpus))
        logger.info(f"Start add new docs: {len(corpus_new)}")
        logger.info(f"Add {len(corpus)} docs, total: {len(self.corpus)}")

    def similarity(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """
        Compute the character-overlap similarity of aligned text pairs.

        :param a: a text or a list of texts
        :param b: a text or a list of texts, same length as ``a``
        :return: list of scores, one per pair; a pair with an empty text scores 0.0
        :raises ValueError: if ``a`` and ``b`` differ in length
        """
        if isinstance(a, str):
            a = [a]
        if isinstance(b, str):
            b = [b]
        if len(a) != len(b):
            raise ValueError("expected two inputs of the same length")

        def calc_pair_sim(sentence1, sentence2):
            if not sentence1 or not sentence2:
                return 0.0
            chars1, chars2 = set(sentence1), set(sentence2)
            # Shared distinct characters relative to the smaller character
            # set; equal to the max of the two per-text ratios.
            return len(chars1 & chars2) / min(len(chars1), len(chars2))

        return [calc_pair_sim(sentence1, sentence2) for sentence1, sentence2 in zip(a, b)]

    def distance(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """Compute the distance between aligned text pairs as ``1 - similarity``."""
        return [1 - s for s in self.similarity(a, b)]

    def most_similar(self, queries: Union[str, List[str], Dict[str, str]], topn: int = 10):
        """
        Find the topn most similar corpus texts for each query.

        :param queries: a single query, a list of queries, or a mapping of
            query id -> query text
        :param topn: maximum number of corpus documents returned per query
        :return: dict of query id -> {corpus id: score}, best score first
        """
        if isinstance(queries, str) or not hasattr(queries, '__len__'):
            queries = [queries]
        if isinstance(queries, list):
            queries = dict(enumerate(queries))
        result = {}
        for qid, query in queries.items():
            scored = [
                (corpus_id, self.similarity(query, doc)[0])
                for corpus_id, doc in self.corpus.items()
            ]
            # Stable sort: ties keep corpus insertion order.
            scored.sort(key=lambda item: item[1], reverse=True)
            result[qid] = dict(scored[:topn])
        return result
class SequenceMatcherSimilarity(SimilarityABC):
    """
    Sequence-matcher similarity between texts, with top-n retrieval over a corpus.

    Character order matters: each pair is scored with ``longest_match_ratio``,
    i.e. based on the share taken by the longest common substring.
    """

    def __init__(self, corpus: Union[List[str], Dict[str, str]] = None):
        """
        :param corpus: optional initial documents, either a list of texts or a
            mapping of document id -> text.
        """
        super().__init__()
        self.corpus = {}  # document id -> text
        self.corpus_ids_map = {}  # position in self.corpus -> document id
        if corpus is not None:
            self.add_corpus(corpus)

    def __len__(self):
        """Get length of corpus."""
        return len(self.corpus)

    def __str__(self):
        base = f"Similarity: {self.__class__.__name__}, matching_model: TextMatcher"
        if self.corpus:
            base += f", corpus size: {len(self.corpus)}"
        return base

    def add_corpus(self, corpus: Union[List[str], Dict[str, str]]):
        """
        Extend the corpus with new documents, skipping texts already stored.

        :param corpus: list of texts (ids are assigned automatically) or a
            mapping of document id -> text (ids are kept as given).
        """
        # One set lookup per document instead of rebuilding a list of all
        # stored texts for every membership test.
        known_docs = set(self.corpus.values())
        corpus_new = {}
        if isinstance(corpus, list):
            # dict.fromkeys de-duplicates while keeping input order, so id
            # assignment is deterministic (set() ordering is not).
            corpus = list(dict.fromkeys(corpus))
            next_id = len(self.corpus)
            for doc in corpus:
                if doc in known_docs:
                    continue
                # Never reuse an id that is already taken; overwriting would
                # silently drop a stored document.
                while next_id in self.corpus:
                    next_id += 1
                corpus_new[next_id] = doc
                next_id += 1
        else:
            for doc_id, doc in corpus.items():
                if doc not in known_docs:
                    corpus_new[doc_id] = doc
        self.corpus.update(corpus_new)
        self.corpus_ids_map = dict(enumerate(self.corpus))
        logger.info(f"Start add new docs: {len(corpus_new)}")
        logger.info(f"Add {len(corpus)} docs, total: {len(self.corpus)}")

    def similarity(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """
        Compute the sequence-matcher similarity of aligned text pairs.

        :param a: a text or a list of texts
        :param b: a text or a list of texts, same length as ``a``
        :return: list of scores, one per pair; a pair with an empty text scores 0.0
        :raises ValueError: if ``a`` and ``b`` differ in length
        """
        if isinstance(a, str):
            a = [a]
        if isinstance(b, str):
            b = [b]
        if len(a) != len(b):
            raise ValueError("expected two inputs of the same length")

        def calc_pair_sim(sentence1, sentence2):
            # Empty texts are scored 0.0 without calling the matcher.
            if not sentence1 or not sentence2:
                return 0.0
            return longest_match_ratio(sentence1, sentence2)

        return [calc_pair_sim(sentence1, sentence2) for sentence1, sentence2 in zip(a, b)]

    def distance(self, a: Union[str, List[str]], b: Union[str, List[str]]):
        """Compute the distance between aligned text pairs as ``1 - similarity``."""
        return [1 - s for s in self.similarity(a, b)]

    def most_similar(self, queries: Union[str, List[str], Dict[str, str]], topn: int = 10):
        """
        Find the topn most similar corpus texts for each query.

        :param queries: a single query, a list of queries, or a mapping of
            query id -> query text
        :param topn: maximum number of corpus documents returned per query
        :return: dict of query id -> {corpus id: score}, best score first
        """
        if isinstance(queries, str) or not hasattr(queries, '__len__'):
            queries = [queries]
        if isinstance(queries, list):
            queries = dict(enumerate(queries))
        result = {}
        for qid, query in queries.items():
            scored = [
                (corpus_id, self.similarity(query, doc)[0])
                for corpus_id, doc in self.corpus.items()
            ]
            # Stable sort: ties keep corpus insertion order.
            scored.sort(key=lambda item: item[1], reverse=True)
            result[qid] = dict(scored[:topn])
        return result

View File

@ -113,12 +113,14 @@ def is_str_match(str1, str2, threshold=1.0):
def longest_match_size(str1, str2):
    """
    Return the length of the longest common substring of two strings.

    A single space is registered as junk with ``SequenceMatcher``, so a match
    is never anchored on a space (spaces can still extend a match).

    :param str1: first string
    :param str2: second string
    :return: int, size of the longest matching block (0 if nothing matches)
    """
    # autojunk=False: the default heuristic drops characters that occur often
    # in a second sequence of 200+ items, which makes long texts report a
    # too-short (even zero) longest match. This trades speed for correctness.
    sq = SequenceMatcher(lambda x: x == " ", str1, str2, autojunk=False)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return match.size
def longest_match_ratio(str1, str2):
    """
    Return the longest common substring length relative to the shorter string.

    A single space is registered as junk with ``SequenceMatcher``, so a match
    is never anchored on a space (spaces can still extend a match).

    :param str1: first string
    :param str2: second string
    :return: ``try_divide(match size, min(len(str1), len(str2)))``
    """
    # autojunk=False: the default heuristic drops characters that occur often
    # in a second sequence of 200+ items, which makes long texts report a
    # too-short (even zero) longest match. This trades speed for correctness.
    sq = SequenceMatcher(lambda x: x == " ", str1, str2, autojunk=False)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    # try_divide is used instead of "/" - presumably to guard the empty-string
    # (zero denominator) case; confirm against its definition.
    return try_divide(match.size, min(len(str1), len(str2)))
@ -134,7 +136,7 @@ def jaccard_coef(A, B):
def num_of_common_sub_str(str1, str2):
"""
求两个字符串的最长公共子串
求两个字符串的最长公共子串同longest_match_size
思想建立一个二维数组保存连续位相同与否的状态
"""
lstr1 = len(str1)
@ -194,18 +196,24 @@ def z_score(x, axis=0):
if __name__ == '__main__':
    # Manual smoke demo for the distance helpers in this module.
    # The earlier integer vectors and the first pair of string assignments
    # were overwritten before any use, so only the effective values remain.
    vec1_test = np.array([1.0, 38.0, 17.0, 32.0])
    vec2_test = np.array([5.0, 6.0, 8.0, 9.0])
    print(euclidean_distance(vec1_test, vec2_test))
    print(cosine_distance(vec1_test, vec2_test))
    print(manhattan_distance(vec1_test, vec2_test))

    str1_test = "你到底是谁?"
    str2_test = "没想到我是谁,是真样子"
    print('strs:', str1_test, str2_test)
    print(edit_distance(str1_test, str2_test))
    print(num_of_common_sub_str(str1_test, str2_test))
    print(max_min_normalize(vec1_test))  # normalise into the 0-1 range
    print(z_score(vec1_test))  # standardise: values spread around 0

    # Compare the three longest-common-substring helpers on one pair.
    str1 = '刘若英是演员和歌手'
    str2 = '刘若英是演员吗?'
    print(f"{str1} vs {str2} common sub str: {num_of_common_sub_str(str1, str2)}")
    print(f"{str1} vs {str2} longest match size: {longest_match_size(str1, str2)}")
    print(f"{str1} vs {str2} longest match ratio: {longest_match_ratio(str1, str2)}")

View File

@ -9,8 +9,16 @@ import unittest
sys.path.append('..')
from similarities.literalsim import SimHashSimilarity, TfidfSimilarity, BM25Similarity, WordEmbeddingSimilarity, \
CilinSimilarity, HownetSimilarity
from similarities.literalsim import (
SimHashSimilarity,
TfidfSimilarity,
BM25Similarity,
WordEmbeddingSimilarity,
CilinSimilarity,
HownetSimilarity,
SameCharsSimilarity,
SequenceMatcherSimilarity,
)
from similarities.utils.distance import hamming_distance
@ -27,7 +35,7 @@ class LiteralCase(unittest.TestCase):
r = 1.0 - hamming_distance(seq1, seq2) / 64
print(hamming_distance(seq1, seq2))
print(r)
print(m.similarity(text1, text2))
print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
text1 = '刘若英是个演员'
text2 = ''
@ -36,25 +44,25 @@ class LiteralCase(unittest.TestCase):
seq2 = m.simhash(text2)
print(seq1)
print(seq2)
print(m.similarity(text1, text2))
print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
text1 = '刘若'
text2 = ''
text1 = '刘若英唱歌'
text2 = '唱歌'
m = SimHashSimilarity()
seq1 = m.simhash(text1)
seq2 = m.simhash(text2)
print(seq1)
print(seq2)
print(m.similarity(text1, text2))
print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
text1 = '刘若'
text2 = '他他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听,他唱歌很好听?他唱歌很好听?他唱歌很好听。。'
text2 = '刘若他他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听,他唱歌很好听?他唱歌很好听?他唱歌很好听。。'
m = SimHashSimilarity()
seq1 = m.simhash(text1)
seq2 = m.simhash(text2)
print(seq1)
print(seq2)
print(m.similarity(text1, text2))
print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
text1 = '刘若 他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听,他唱歌很好听?他唱歌很好听?他唱歌很好'
text2 = '他他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听他唱歌很好听,他唱歌很好听?他唱歌很好听?他唱歌很好听。。'
@ -64,7 +72,7 @@ class LiteralCase(unittest.TestCase):
print(seq1)
print(seq2)
s = m.similarity(text1, text2)
print(s)
print(f"{text1} vs {text2} sim score {s}")
self.assertTrue(s[0] > 0.5)
def test_simhash(self):
@ -72,7 +80,7 @@ class LiteralCase(unittest.TestCase):
text1 = '刘若英是个演员'
text2 = '他唱歌很好听'
m = SimHashSimilarity()
print(m.similarity(text1, text2))
print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
print(m.distance(text1, text2))
r = m.most_similar('刘若英是演员')
self.assertEqual(len(r[0]), 0)
@ -82,6 +90,58 @@ class LiteralCase(unittest.TestCase):
print(r)
self.assertEqual(len(r[0]), 2)
def test_short_text_simhash(self):
    # Prints SimHash scores for short texts from three sources: ori_simhash,
    # the third-party ``simhash`` package, and SimHashSimilarity.similarity.
    text1 = '你妈妈喊你回家吃饭哦,回家罗回家罗'
    text2 = '你妈妈叫你回家吃饭哦,回家罗回家罗'
    m = SimHashSimilarity()
    seq1, seq2 = m.ori_simhash(text1), m.ori_simhash(text2)
    print(seq1)
    print(seq2)
    scores = [m._sim_score(seq1, seq2)]
    print(f"{text1} vs {text2} ori_simhash sim score {scores}")

    def simhash_demo(text_a, text_b):
        """
        Similarity of two texts according to the ``simhash`` package.
        :param text_a: first text
        :param text_b: second text
        :return: 1 - hamming distance / longest ``bin()`` string of the two hash values
        """
        from simhash import Simhash
        hash_a, hash_b = Simhash(text_a), Simhash(text_b)
        print(hash_a.value)
        bit_width = max(len(bin(hash_a.value)), len(bin(hash_b.value)))
        # Hamming distance between the two hashes.
        hamming = hash_a.distance(hash_b)
        print(hamming)
        return 1 - hamming / bit_width

    similar = simhash_demo(text1, text2)
    print(f"{text1} vs {text2} simhash_demo sim score {similar}")
    print(f"{text1} vs {text2} simhash sim score {m.similarity(text1, text2)}")

    text1 = "平台专注于游戏领域,多年的AI技术积淀,一站式提供文本、图片、音/视频内容审核,游戏AI以及数据平台服务"
    text2 = "平台专注于游戏领域,多年的AI技术积淀,二站式提供文本、图片、音 视频内容审核,游戏AI以及数据平台服务"
    text3 = '平台专注于游戏领域,多年的AI技术积淀,三站式提供文本、图片、音视频内容审核'
    # Run all three comparisons first (each prints its own details), then the scores.
    demo_scores = [
        simhash_demo(left, right)
        for left, right in ((text1, text2), (text1, text3), (text2, text3))
    ]
    for demo_score in demo_scores:
        print(demo_score)
    print(f"{text1} vs {text2} sim score {m.similarity(text1, text2)}")
    print(m.distance(text1, text2))

    # Empty corpus: the query yields no hits.
    r = m.most_similar('刘若英是演员')
    self.assertEqual(len(r[0]), 0)
    m.add_corpus(['刘若英是个演员', '他唱歌很好听', 'women喜欢这首歌'])
    r = m.most_similar('刘若英是演员', topn=2)
    print(r)
def test_tfidf(self):
"""test_tfidf"""
text1 = '刘若英是个演员'
@ -164,6 +224,50 @@ class LiteralCase(unittest.TestCase):
print(r)
self.assertEqual(len(r), 2)
def test_samechars(self):
    """test_samechars"""
    sentence_pairs = (
        ('周杰伦是一个歌手', '刘若英是个演员'),
        ('刘若英是演员', '刘若英是个演员'),
    )
    model = None
    # A fresh model per pair; the last one is reused for the corpus checks.
    for left, right in sentence_pairs:
        model = SameCharsSimilarity()
        print(model.similarity(left, right))
        print(model.distance(left, right))

    model.add_corpus(['刘若英是个演员', '他唱歌很好听', 'women喜欢这首歌'])
    single_hits = model.most_similar('刘若英是演员', topn=3)
    print(single_hits)
    # Every corpus entry is returned for the single query.
    self.assertEqual(len(single_hits[0]), 3)
    batch_hits = model.most_similar(['刘若英是演员', '唱歌很好听'])
    print(batch_hits)
    # One result entry per query.
    self.assertEqual(len(batch_hits), 2)
def test_seqmatcher(self):
    """test_seqmatcher"""
    sentence_pairs = (
        ('周杰伦是一个歌手', '刘若英是个演员'),
        ('刘若英是演员', '刘若英是个演员'),
    )
    model = None
    # A fresh model per pair; the last one is reused for the corpus checks.
    for left, right in sentence_pairs:
        model = SequenceMatcherSimilarity()
        print(model.similarity(left, right))
        print(model.distance(left, right))

    model.add_corpus(['刘若英是个演员', '他唱歌很好听', 'women喜欢这首歌'])
    single_hits = model.most_similar('刘若英是演员', topn=3)
    print(single_hits)
    # Every corpus entry is returned for the single query.
    self.assertEqual(len(single_hits[0]), 3)
    batch_hits = model.most_similar(['刘若英是演员', '唱歌很好听'])
    print(batch_hits)
    # One result entry per query.
    self.assertEqual(len(batch_hits), 2)
if __name__ == '__main__':
unittest.main()