Macropodus/macropodus/similarity/similarity_word2vec_char.py

70 lines
2.2 KiB
Python

# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time : 2019/12/17 14:50
# @author : Mo
# @function: sentence similarity based on character-level word2vec vectors
from macropodus.base.word2vec import W2v
class SimW2vChar(W2v):
    """Sentence similarity built on character-level word2vec vectors.

    A sentence vector is the sum (or mean) of the per-character vectors read
    from ``self.w2v_char``. The attributes ``self.w2v_char``, ``self.cosine``
    and ``self.jaccard`` are not defined in this class; they are expected to
    be provided by the ``W2v`` base class.
    """

    def __init__(self, use_cache=True):
        """Initialise the base class.

        :param use_cache: bool, forwarded unchanged to ``W2v.__init__``
        """
        super().__init__(use_cache)

    def encode(self, sent, type_encode="other"):
        """Build a character-level sentence vector.

        :param sent: str, like "大漠帝国"
        :param type_encode: str, "avg" returns the mean over the characters;
            any other value (e.g. "other") returns the plain sum
        :return: vector
        """
        # Multiply an existing embedding by 0 to get an all-zero starting
        # vector of matching size (assumes embeddings are numeric arrays).
        sentence_vec = self.w2v_char.wv[self.w2v_char.index2word[1]] * 0
        len_sent = len(sent)
        for char in sent:
            try:
                sentence_vec = sentence_vec + self.w2v_char.wv[char]
            except KeyError:
                # Character missing from the vocabulary: add a small constant
                # to every component instead of failing.
                sentence_vec = sentence_vec + 0.01
        # Skip the division for an empty sentence so the zero vector is
        # returned as-is rather than dividing by zero.
        if type_encode == "avg" and len_sent > 0:
            sentence_vec = sentence_vec / len_sent
        return sentence_vec

    def similarity(self, sent1, sent2, type_sim="total", type_encode="avg"):
        """Score the similarity of two sentences.

        :param sent1: str, like "大漠帝国"
        :param sent2: str, like "Macropodus"
        :param type_sim: str, "total" averages the cosine score with
            ``self.jaccard(sent1, sent2)``; any other value (e.g. "cosine")
            returns the cosine score alone
        :param type_encode: str, like "other" or "avg", passed to ``encode``
        :return: float, like 0.998
        """
        if sent1 and sent2:
            vec1 = self.encode(sent1, type_encode)
            vec2 = self.encode(sent2, type_encode)
            score_res = self.cosine(vec1, vec2)
        else:
            # An empty sentence has nothing to encode; its cosine part is 0.
            score_res = 0.0
        if type_sim == "total":
            score_jaccard = self.jaccard(sent1, sent2)
            score_res = (score_res + score_jaccard) / 2
        return score_res
if __name__ == '__main__':
    # Quick smoke run on a fixed sentence pair.
    sent1 = "大漠帝国"
    sent2 = "macropodus"
    swc = SimW2vChar(use_cache=True)
    score = swc.similarity(sent1, sent2)
    print(score)
    # Interactive loop: read two sentences and print their similarity.
    # Stops cleanly on Ctrl-D / Ctrl-C instead of ending with a traceback.
    while True:
        try:
            print("请输入sent1:")
            sent1 = input()
            print("请输入sent2:")
            sent2 = input()
        except (EOFError, KeyboardInterrupt):
            break
        print(swc.similarity(sent1, sent2))