update readme
parent 34fd94d9c6
commit 16e1d283c1
@@ -8,11 +8,11 @@
 [![Wechat Group](http://vlog.sfyc.ltd/wechat_everyday/wxgroup_logo.png?imageView2/0/w/60/h/20)](#Contact)
 
 # Similarities
 
-Similarities is a toolkit for Compute Similarity Score between texts.
+Similarities is a toolkit for similarity calculation and semantic search based on matching models.
 
-A similarity calculation toolkit implementing multiple lexical and semantic matching models.
+similarities: a toolkit for similarity calculation and semantic matching search.
 
-**similarities** implements multiple text representation and text similarity models such as Word2Vec, RankBM25, BERT, Sentence-BERT, and CoSENT, and compares the performance of these models on the text semantic matching (similarity calculation) task.
+**similarities** builds on multiple lexical and semantic matching models and provides similarity calculation and matching search for each of them; developed in Python 3, installable with pip, and ready to use out of the box.
 
 **Guide**
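As a quick illustration of the "similarity calculation and semantic search" workflow the updated description promises, here is a minimal sketch using the BertSimilarity class changed later in this commit; the import path and constructor arguments are assumptions, since the commit does not show them.

```python
from text2vec import SentenceModel       # sentence encoder used throughout this diff
from similarities import BertSimilarity  # import path assumed, not shown in the commit

docs = ["The cat sits on the mat.", "A dog plays in the yard.", "Cats like to sit on mats."]
model = BertSimilarity(SentenceModel(), docs)  # constructor arguments assumed

# similarity calculation: cosine score between two texts
print(model.similarity("The cat sits on the mat.", "Cats like to sit on mats."))

# semantic search: top-2 docs most similar to the query
for doc, score in model.most_similar("Where does the cat sit?", topn=2):
    print(score, doc)
```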
@@ -22,7 +22,7 @@ from similarities.utils.distance import cosine_distance
 from simhash import Simhash
 from similarities.utils.tfidf import TFIDF
 
-pwd_path = os.path.dirname(os.path.abspath(__file__))
+pwd_path = os.path.abspath(os.path.dirname(__file__))
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -13,7 +13,7 @@ from text2vec import SentenceModel
 
 from similarities.similarity import cos_sim, semantic_search, dot_score
 
-pwd_path = os.path.dirname(os.path.abspath(__file__))
+pwd_path = os.path.abspath(os.path.dirname(__file__))
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -32,7 +32,7 @@ class BertSimilarity(object):
             self.add_documents(docs)
 
     def __len__(self):
-        """Get length of index."""
+        """Get length of docs."""
         return self.docs_embeddings.shape[0]
 
     def __str__(self):
@@ -44,7 +44,7 @@ class BertSimilarity(object):
 
         Parameters
         ----------
-        docs : iterable of list of str
+        docs : list of str
         """
         self.docs += docs
         docs_embeddings = self.get_vector(docs)
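The add_documents signature above, together with the __len__ and embedding-stacking hunks around it, implies incremental indexing: each call encodes the new docs and appends their embeddings to docs_embeddings. A small sketch of that flow; the import path and constructor arguments are assumed.

```python
from text2vec import SentenceModel
from similarities import BertSimilarity  # import path assumed

model = BertSimilarity(SentenceModel())           # constructor arguments assumed
model.add_documents(["first doc", "second doc"])  # encodes and stores 2 embeddings
model.add_documents(["third doc"])                # stacks 1 more row onto docs_embeddings
print(len(model))                                 # __len__ returns docs_embeddings.shape[0], expected 3
```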
@@ -64,20 +64,72 @@ class BertSimilarity(object):
 
     def distance(self, text1, text2):
         """Compute cosine distance between two keys.
         Calculate 1 - :meth:`~gensim.models.keyedvectors.KeyedVectors.similarity`.
-
-        Parameters
-        ----------
-        w1 : str
-            Input key.
-        w2 : str
-            Input key.
-
-        Returns
-        -------
-        float
-            Distance between `w1` and `w2`.
-
         """
         return 1 - self.similarity(text1, text2)
 
+    def most_similar(self, query, topn=10):
+        result = []
+        query_embeddings = self.get_vector(query)
+        hits = semantic_search(query_embeddings, self.docs_embeddings, top_k=topn)
+        hits = hits[0]  # Get the hits for the first query
+
+        print("Input question:", query)
+        for hit in hits[0:topn]:
+            result.append((self.docs[hit['corpus_id']], round(hit['score'], 4)))
+            print("\t{:.3f}\t{}".format(hit['score'], self.docs[hit['corpus_id']]))
+
+        print("\n\n========\n")
+        return result
+
+
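most_similar above is a thin wrapper over semantic_search (imported in an earlier hunk): the query is encoded once, scored against all cached doc embeddings, and the top-k hits come back as dicts with 'corpus_id' and 'score'. A hedged sketch of calling semantic_search directly; the embedding shapes are assumptions.

```python
from text2vec import SentenceModel
from similarities.similarity import semantic_search

model = SentenceModel()
docs = ["doc one", "doc two", "doc three"]
docs_embeddings = model.encode(docs)        # assumed shape: (num_docs, dim)
query_embeddings = model.encode("a query")  # assumed shape: (dim,) or (1, dim)

hits = semantic_search(query_embeddings, docs_embeddings, top_k=2)
for hit in hits[0]:                              # hits[0]: results for the first query
    print(hit['score'], docs[hit['corpus_id']])  # each hit is {'corpus_id': ..., 'score': ...}
```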
+class AnnoySimilarity(object):
+    """
+    Computes cosine similarities between sentence embeddings and retrieves the
+    most similar docs for a given query.
+    """
+
+    def __init__(self, sentencemodel: SentenceModel, docs: List[str] = None):
+        # super().__init__()
+        self.sentencemodel = sentencemodel
+        self.docs = []
+        self.docs_embeddings = np.array([])
+        if docs is not None:
+            self.add_documents(docs)
+
+    def __len__(self):
+        """Get length of docs."""
+        return self.docs_embeddings.shape[0]
+
+    def __str__(self):
+        return "%s" % (self.__class__.__name__)
+
+    def add_documents(self, docs):
+        """
+        Extend the docs_embeddings with new documents.
+
+        Parameters
+        ----------
+        docs : list of str
+        """
+        self.docs += docs
+        docs_embeddings = self.get_vector(docs)
+        if self.docs_embeddings.size > 0:
+            self.docs_embeddings = np.vstack((self.docs_embeddings, docs_embeddings))
+        else:
+            self.docs_embeddings = docs_embeddings
+        logger.info(f"Add docs size: {len(docs)}, total size: {len(self.docs)}")
+
+    def get_vector(self, text):
+        return self.sentencemodel.encode(text)
+
+    def similarity(self, text1, text2, score_function=cos_sim):
+        text_emb1 = self.get_vector(text1)
+        text_emb2 = self.get_vector(text2)
+        return score_function(text_emb1, text_emb2)
+
+    def distance(self, text1, text2):
+        """Compute cosine distance between two keys.
+        """
+        return 1 - self.similarity(text1, text2)
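A short usage sketch for the new AnnoySimilarity class, exercising only the constructor and methods visible in this hunk (the hunk shown here does not include any Annoy index construction); the import path is an assumption.

```python
from text2vec import SentenceModel
from similarities import AnnoySimilarity  # import path assumed

model = AnnoySimilarity(SentenceModel(), docs=["doc one", "doc two"])
model.add_documents(["doc three"])  # new embeddings are stacked via np.vstack
print(len(model))                   # 3: docs_embeddings.shape[0]

print(model.similarity("doc one", "doc two"))  # cos_sim score between two texts
print(model.distance("doc one", "doc two"))    # 1 - similarity
```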