# nlp_xiaojiang/FeatureProject/sentence_sim_feature.py
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/1 10:35
# @author :Mo
# @function :calculate similarity features between two sentences: word2vec, tf-idf, edit distances and pinyin
from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, jaccard_similarity_coefficient_distance
from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized
from FeatureProject.distance_text_or_vec import mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance
from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score
from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy
from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str
from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin
from FeatureProject.distance_vec_TS_SS import TS_SS
from gensim import corpora, models, matutils
from conf.path_config import projectdir
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import pickle
import jieba
import time
import os


class SentenceSimFeature:
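    """Hand-crafted similarity features between two sentences: word2vec vector
    distances, tf-idf cosine, edit/fuzzy string distances, Jaccard coefficients,
    and pinyin variants of the text features.

    Typical use (a sketch, mirroring the __main__ block below):
        ssf = SentenceSimFeature()
        ssf.init_sentence_vector()  # load the word2vec model
        ssf.init_tfidf()            # load the pickled tf-idf models
        ssf.set_data(sen1, sen2)
        ssf.encode_sentence_vector()
        features = ssf.w2c_all_vec() + ssf.edit_all_str()
    """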
def __init__(self):
self.sen1 = None
self.sen2 = None
self.seg1 = None
self.seg2 = None
self.sen_vec1 = None
self.sen_vec2 = None
self.tfidf_vec1 = None
self.tfidf_vec2 = None
self.dictionary = None
self.tfidf_model = None
self.w2c_model = None
self.tfidf_pinyin_model = None
self.dictionary_pinyin = None
self.sen1_pinyin = None
self.sen2_pinyin = None
self.seg1_pinyin = None
self.seg2_pinyin = None
self.tfidf_vec1_pinyin = None
self.tfidf_vec2_pinyin = None
def set_data(self, sen1, sen2):
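        """Clean both sentences, then precompute word segmentations, pinyin forms,
        zeroed sentence-vector accumulators and sparse tf-idf bags.
        Requires init_tfidf() to have been called first."""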
sen1 = clear_sentence(sen1)
sen2 = clear_sentence(sen2)
self.sen1 = str(sen1).strip()
self.sen2 = str(sen2).strip()
self.seg1 = list(jieba.cut(sen1))
self.seg2 = list(jieba.cut(sen2))
self.sen1_pinyin = chinese2pinyin(sen1)
self.sen2_pinyin = chinese2pinyin(sen2)
self.seg1_pinyin = (self.sen1_pinyin).split(' ')
self.seg2_pinyin = (self.sen2_pinyin).split(' ')
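        # accumulators for the summed word2vec sentence vectors; 300 must match the embedding dimension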
self.sen_vec1 = np.zeros(300)
self.sen_vec2 = np.zeros(300)
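        # the commented lines below are an unused sklearn-style dense tf-idf alternative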
# self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0])
# self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0])
# self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0])
# self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0])
self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)]
self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)]
self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)]
self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)]
def same_word_count(self):
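        """Number of shared word tokens: the smaller of the two directed overlap counts."""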
count_left = 0
for s in self.seg1:
if s in self.seg2:
count_left += 1
count_right = 0
for s in self.seg2:
if s in self.seg1:
count_right += 1
return min(count_left, count_right)
def same_char_count(self):
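        """Number of shared characters: the smaller of the two directed overlap counts."""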
seg1 = list(self.sen1)
seg2 = list(self.sen2)
count_left = 0
for s in seg1:
if s in seg2:
count_left += 1
count_right = 0
for s in seg2:
if s in seg1:
count_right += 1
return min(count_left, count_right)
def sentence_length(self):
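        """Length features: absolute difference, length ratio, and len1*len2/(len1+len2)."""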
len_sen1 = len(self.sen1)
len_sen2 = len(self.sen2)
len_abs_sub = abs(len_sen1 - len_sen2)
        # guard against empty sentences to avoid ZeroDivisionError
        len_rate = len_sen1 / max(len_sen2, 1)
        len_add_rate = len_sen1 * len_sen2 / max(len_sen1 + len_sen2, 1)
return [len_abs_sub, len_rate, len_add_rate]
def init_sentence_vector(self):
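        """Load the pretrained word2vec KeyedVectors (text format) from word2_vec_path."""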
# file_path = os.path.dirname(__file__)
print('load w2v model begin')
# model_path = os.path.join(file_path, word2_vec_path)
self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None) # ,binary=True)
print('load w2v model success')
    def encode_sentence_vector(self):
        """Sum word vectors over each segmented sentence; out-of-vocabulary words contribute nothing."""
        for s in self.seg1:
            try:
                self.sen_vec1 += self.w2c_model[s]
            except KeyError:  # OOV word: skip it
                continue
        for s in self.seg2:
            try:
                self.sen_vec2 += self.w2c_model[s]
            except KeyError:
                continue
    def init_tfidf(self):
        """Load the pickled (dictionary, tfidf_model) pairs for words and for pinyin."""
        with open(td_idf_path, 'rb') as file:
            tfidf_dictionary_model = pickle.load(file)
        self.dictionary = tfidf_dictionary_model[0]
        self.tfidf_model = tfidf_dictionary_model[1]
        with open(td_idf_path_pinyin, 'rb') as file:
            tfidf_dictionary_pinyin_model = pickle.load(file)
        self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0]
        self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1]
        print("init_tfidf ok!")
def w2c_all_vec(self):
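        """Eleven vector distances between the two summed word2vec sentence vectors;
        requires encode_sentence_vector() to have been called first."""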
w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2)
w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2)
w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2)
w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2)
w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2)
w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2)
w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2)
w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2)
w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2)
w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2)
w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2)
# w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2)
return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev,
w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson]
def tdidf_all_vec(self):
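        """Cosine similarity of the two sparse tf-idf bags (gensim matutils.cossim)."""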
return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2)
def edit_all_str(self):
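        """String-level features: hamming, levenshtein edit/ratio/jaro, fuzzywuzzy
        set/sort ratios, longest common substring length, and word-mover distance."""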
str_hamming = hamming_distance(self.sen1, self.sen2)
str_edit = edit_levenshtein(self.sen1, self.sen2)
str_ratio = ratio_levenshtein(self.sen1, self.sen2)
str_jaro = jaro_levenshtein(self.sen1, self.sen2)
str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2)
str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2)
str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2)
str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2)
return [str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd]
    def word_jaccard(self):
        """Jaccard coefficient over jieba word tokens."""
        a = set(self.seg1) & set(self.seg2)
        b = set(self.seg1) | set(self.seg2)
        return len(a) / max(len(b), 1)  # guard against an empty union
    def char_jaccard(self):
        """Jaccard coefficient over characters."""
        a = set(self.sen1) & set(self.sen2)
        b = set(self.sen1) | set(self.sen2)
        return len(a) / max(len(b), 1)  # guard against an empty union
def tdidf_all_vec_pinyin(self):
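        """Cosine similarity of the two sparse pinyin tf-idf bags."""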
return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin)
def edit_all_pinyin(self):
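        """The same string-level features as edit_all_str, computed on the pinyin forms."""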
pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin)
pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
pinyin_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin)
pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin)
return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd]
    def word_jaccard_pinyin(self):
        """Jaccard coefficient over pinyin syllable tokens."""
        a = set(self.seg1_pinyin) & set(self.seg2_pinyin)
        b = set(self.seg1_pinyin) | set(self.seg2_pinyin)
        return len(a) / max(len(b), 1)
    def char_jaccard_pinyin(self):
        """Jaccard coefficient over the characters of the pinyin strings."""
        # use the pinyin strings (sen*_pinyin) so this is character-level,
        # distinct from word_jaccard_pinyin above
        a = set(self.sen1_pinyin) & set(self.sen2_pinyin)
        b = set(self.sen1_pinyin) | set(self.sen2_pinyin)
        return len(a) / max(len(b), 1)


def sentence_input_t():
    """Interactive loop: read two sentences from stdin and print all similarity features.

    Relies on the module-level `ssf` instance created in the __main__ block below.
    """
while True:
s1 = input('s1: ')
s2 = input('s2: ')
start_time = time.time()
ssf.set_data(s1, s2)
ssf.encode_sentence_vector()
time1 = time.time()
print('set_data time' + str(time1 - start_time))
        # same words and length features
same_word_count = ssf.same_word_count()
time2 = time.time()
print('same_word_count time' + str(time2 - time1))
same_char_count = ssf.same_char_count()
time3 = time.time()
print('same_char_count time' + str(time3 - time2))
[len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
time4 = time.time()
print('sentence_length time' + str(time4 - time3))
# w2c_all_vec
[w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
time5 = time.time()
print('w2c_all_vec time' + str(time5 - time4))
# tdidf_all_vec
# [tdidf_Cosine, tdidf_TS_SS, tdidf_Manhattan, tdidf_Euclidean,
# tdidf_Jaccard, tdidf_Chebyshev,tdidf_Minkowski, tdidf_Euclidean_Standard, tdidf_Mahalanobis,
# tdidf_Bray, tdidf_Pearson] = ssf.tdidf_all_vec()
tdidf_cossim = ssf.tdidf_all_vec()
time6 = time.time()
print('tdidf_all_vec time' + str(time6 - time5))
# edit_all_str
[str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
time7 = time.time()
print('edit_all_str time' + str(time7 - time6))
        # Jaccard coefficient
word_jaccard = ssf.word_jaccard()
char_jaccard = ssf.char_jaccard()
time8 = time.time()
        print('jaccard time' + str(time8 - time7))
# tdidf_all_vec_pinyin
# [tdidf_piyin_Cosine, tdidf_piyin_TS_SS, tdidf_piyin_Manhattan, tdidf_piyin_Euclidean, tdidf_piyin_Jaccard,
# tdidf_piyin_Chebyshev, tdidf_piyin_Minkowski, tdidf_piyin_Euclidean_Standard, tdidf_piyin_Mahalanobis,
# tdidf_piyin_Bray, tdidf_piyin_Pearson] = ssf.tdidf_all_vec_pinyin()
tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
time9 = time.time()
print('tdidf_all_vec_pinyin time' + str(time9 - time8))
# edit_all_pinyin
[pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
time10 = time.time()
print('edit_all_pinyin time' + str(time10 - time9))
        # Jaccard coefficient
word_jaccard_pinyin = ssf.word_jaccard_pinyin()
char_jaccard_pinyin = ssf.char_jaccard_pinyin()
time11 = time.time()
        print('jaccard pinyin time' + str(time11 - time10))
sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz,
str_sort_ratio_fuzz,
str_commonsubstr, str_list_Wmd,
word_jaccard, char_jaccard, tdidf_pinyin_cossim,
pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
pinyin_sort_ratio_fuzz,
pinyin_commonsubstr, pinyin_list_Wmd,
word_jaccard_pinyin, char_jaccard_pinyin]
print("sim: ")
print(sim_all_last)


if __name__ == '__main__':
ssf = SentenceSimFeature()
ssf.init_sentence_vector()
ssf.init_tfidf()
s1 = "你知道Mo的能力上限吗"
s2 = "你好呀Mo水平很差"
start_time = time.time()
ssf.set_data(s1, s2)
ssf.encode_sentence_vector()
time1 = time.time()
print('set_data time' + str(time1 - start_time))
    # same words and length features
same_word_count = ssf.same_word_count()
time2 = time.time()
print('same_word_count time' + str(time2 - time1))
same_char_count = ssf.same_char_count()
time3 = time.time()
print('same_char_count time' + str(time3 - time2))
[len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
time4 = time.time()
print('sentence_length time' + str(time4 - time3))
# w2c_all_vec
[w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
time5 = time.time()
print('w2c_all_vec time' + str(time5 - time4))
# tdidf_all_vec
tdidf_cossim = ssf.tdidf_all_vec()
time6 = time.time()
print('tdidf_all_vec time' + str(time6 - time5))
# edit_all_str
[str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
time7 = time.time()
print('edit_all_str time' + str(time7 - time6))
    # Jaccard coefficient
word_jaccard = ssf.word_jaccard()
char_jaccard = ssf.char_jaccard()
time8 = time.time()
    print('jaccard time' + str(time8 - time7))
# pinyin
tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
time9 = time.time()
print('tdidf_all_vec_pinyin time' + str(time9 - time8))
# edit_all_pinyin
[pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
time10 = time.time()
print('edit_all_pinyin time' + str(time10 - time9))
    # Jaccard coefficient
word_jaccard_pinyin = ssf.word_jaccard_pinyin()
char_jaccard_pinyin = ssf.char_jaccard_pinyin()
time11 = time.time()
    print('jaccard pinyin time' + str(time11 - time10))
sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz,
str_commonsubstr, str_list_Wmd,
word_jaccard, char_jaccard, tdidf_pinyin_cossim,
pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
pinyin_sort_ratio_fuzz,
pinyin_commonsubstr, pinyin_list_Wmd,
word_jaccard_pinyin, char_jaccard_pinyin]
print("小姜机器人计算sim: ")
print(sim_all_last)
sentence_input_t()