# nlp_xiaojiang/FeatureProject/sentence_sim_feature.py
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/1 10:35
# @author :Mo
# @function :calculate similarity features of sentence pairs based on word2vec, tf-idf and pinyin
from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, jaccard_similarity_coefficient_distance
from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized
from FeatureProject.distance_text_or_vec import mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance
from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score
from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy
from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str
from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin
from FeatureProject.distance_vec_TS_SS import TS_SS
from gensim import corpora, models, matutils
from conf.path_config import projectdir
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import pickle
import jieba
import time
import os


class SentenceSimFeature:
    def __init__(self):
        self.sen1 = None
        self.sen2 = None
        self.seg1 = None
        self.seg2 = None
        self.sen_vec1 = None
        self.sen_vec2 = None
        self.tfidf_vec1 = None
        self.tfidf_vec2 = None
        self.dictionary = None
        self.tfidf_model = None
        self.w2c_model = None
        self.tfidf_pinyin_model = None
        self.dictionary_pinyin = None
        self.sen1_pinyin = None
        self.sen2_pinyin = None
        self.seg1_pinyin = None
        self.seg2_pinyin = None
        self.tfidf_vec1_pinyin = None
        self.tfidf_vec2_pinyin = None

    def set_data(self, sen1, sen2):
        sen1 = clear_sentence(sen1)
        sen2 = clear_sentence(sen2)
        self.sen1 = str(sen1).strip()
        self.sen2 = str(sen2).strip()
        self.seg1 = list(jieba.cut(sen1))
        self.seg2 = list(jieba.cut(sen2))
        self.sen1_pinyin = chinese2pinyin(sen1)
        self.sen2_pinyin = chinese2pinyin(sen2)
        self.seg1_pinyin = self.sen1_pinyin.split(' ')
        self.seg2_pinyin = self.sen2_pinyin.split(' ')
        self.sen_vec1 = np.zeros(300)
        self.sen_vec2 = np.zeros(300)
        # alternative dense TF-IDF computation, kept commented out in the original:
        # self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0])
        # self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0])
        # self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0])
        # self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0])
        self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)]
        self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)]
        self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)]
        self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)]
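
    # A minimal illustration (made-up token ids and weights) of the gensim
    # sparse format produced by set_data(): doc2bow yields (token_id, count)
    # pairs and TfidfModel re-weights them, e.g.
    #     dictionary.doc2bow(['你好', '世界'])  ->  [(0, 1), (1, 1)]
    #     tfidf_model[[(0, 1), (1, 1)]]        ->  [(0, 0.71), (1, 0.71)]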

    def same_word_count(self):
        count_left = 0
        for s in self.seg1:
            if s in self.seg2:
                count_left += 1
        count_right = 0
        for s in self.seg2:
            if s in self.seg1:
                count_right += 1
        return min(count_left, count_right)
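
    # Worked example: seg1 = ['你', '好', '你'], seg2 = ['你'] gives
    # count_left = 2 (every '你' in seg1 also occurs in seg2) but
    # count_right = 1, so min() keeps the count symmetric at 1.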

    def same_char_count(self):
        seg1 = list(self.sen1)
        seg2 = list(self.sen2)
        count_left = 0
        for s in seg1:
            if s in seg2:
                count_left += 1
        count_right = 0
        for s in seg2:
            if s in seg1:
                count_right += 1
        return min(count_left, count_right)

    def sentence_length(self):
        len_sen1 = len(self.sen1)
        len_sen2 = len(self.sen2)
        len_abs_sub = abs(len_sen1 - len_sen2)
        # guard denominators so empty sentences cannot raise ZeroDivisionError
        len_rate = len_sen1 / max(len_sen2, 1)
        len_add_rate = len_sen1 * len_sen2 / max(len_sen1 + len_sen2, 1)
        return [len_abs_sub, len_rate, len_add_rate]
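
    # Worked example: lengths 5 and 3 give len_abs_sub = 2,
    # len_rate = 5 / 3 ≈ 1.67 and len_add_rate = 15 / 8 = 1.875.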

    def init_sentence_vector(self):
        # file_path = os.path.dirname(__file__)
        print('load w2v model begin')
        # model_path = os.path.join(file_path, word2_vec_path)
        self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None)  # , binary=True
        print('load w2v model success')

    def encode_sentence_vector(self):
        for s in self.seg1:
            try:
                self.sen_vec1 += self.w2c_model[s]
            except KeyError:  # out-of-vocabulary word contributes nothing
                continue
        for s in self.seg2:
            try:
                self.sen_vec2 += self.w2c_model[s]
            except KeyError:
                continue
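
    # Note: each sentence vector is the unnormalized sum of its 300-d word
    # vectors; out-of-vocabulary tokens contribute nothing, so sentences of
    # different lengths can differ strongly in vector magnitude.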

    def init_tfidf(self):
        # each pickle holds [dictionary, tfidf_model]; see the hedged
        # build_tfidf_pickle() sketch after this class for how such a
        # file could be produced
        with open(td_idf_path, 'rb') as file:
            tfidf_dictionary_model = pickle.load(file)
        self.dictionary = tfidf_dictionary_model[0]
        self.tfidf_model = tfidf_dictionary_model[1]
        with open(td_idf_path_pinyin, 'rb') as file:
            tfidf_dictionary_pinyin_model = pickle.load(file)
        self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0]
        self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1]
        print("init_tfidf ok!")

    def w2c_all_vec(self):
        w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2)
        w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2)
        w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2)
        w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2)
        w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2)
        w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2)
        w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2)
        w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2)
        w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2)
        w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2)
        w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2)
        # w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2)
        return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev,
                w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson]

    def tdidf_all_vec(self):
        return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2)
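
    # matutils.cossim computes cosine similarity directly on gensim sparse
    # (token_id, weight) vectors, so no dense conversion is needed here.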

    def edit_all_str(self):
        str_hamming = hamming_distance(self.sen1, self.sen2)
        str_edit = edit_levenshtein(self.sen1, self.sen2)
        str_ratio = ratio_levenshtein(self.sen1, self.sen2)
        str_jaro = jaro_levenshtein(self.sen1, self.sen2)
        str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2)
        str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2)
        str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2)
        str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2)
        return [str_hamming, str_edit, str_ratio, str_jaro,
                str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd]

    def word_jaccard(self):
        a = set(self.seg1).intersection(set(self.seg2))
        b = set(self.seg1).union(set(self.seg2))
        return len(a) / len(b)

    def char_jaccard(self):
        a = set(self.sen1).intersection(set(self.sen2))
        b = set(self.sen1).union(set(self.sen2))
        return len(a) / len(b)
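
    # Both methods implement the Jaccard coefficient |A ∩ B| / |A ∪ B|;
    # e.g. A = {'你', '好'}, B = {'你'} gives 1 / 2 = 0.5.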

    def tdidf_all_vec_pinyin(self):
        return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin)

    def edit_all_pinyin(self):
        pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin)
        pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin)
        return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
                pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd]

    def word_jaccard_pinyin(self):
        a = set(self.seg1_pinyin).intersection(set(self.seg2_pinyin))
        b = set(self.seg1_pinyin).union(set(self.seg2_pinyin))
        return len(a) / len(b)

    def char_jaccard_pinyin(self):
        # compare at the character level of the pinyin strings; the original
        # used the syllable lists, which duplicated word_jaccard_pinyin
        a = set(self.sen1_pinyin).intersection(set(self.sen2_pinyin))
        b = set(self.sen1_pinyin).union(set(self.sen2_pinyin))
        return len(a) / len(b)
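

# A minimal sketch, not part of the original project, of how the
# [dictionary, tfidf_model] pickles loaded by init_tfidf() could be built
# with gensim; `sentences` (raw training strings) and `save_path` are
# hypothetical names chosen for illustration.
def build_tfidf_pickle(sentences, save_path):
    segs = [list(jieba.cut(clear_sentence(s))) for s in sentences]
    dictionary = corpora.Dictionary(segs)                   # token -> id mapping
    bow_corpus = [dictionary.doc2bow(seg) for seg in segs]  # sparse counts
    tfidf_model = models.TfidfModel(bow_corpus)             # fit idf weights
    with open(save_path, 'wb') as f:
        pickle.dump([dictionary, tfidf_model], f)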


def sentence_input_t():
    # interactive loop; relies on the module-level `ssf` created in __main__
    while True:
        s1 = input('s1: ')
        s2 = input('s2: ')
        start_time = time.time()
        ssf.set_data(s1, s2)
        ssf.encode_sentence_vector()
        time1 = time.time()
        print('set_data time ' + str(time1 - start_time))
        # shared words and lengths
        same_word_count = ssf.same_word_count()
        time2 = time.time()
        print('same_word_count time ' + str(time2 - time1))
        same_char_count = ssf.same_char_count()
        time3 = time.time()
        print('same_char_count time ' + str(time3 - time2))
        [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
        time4 = time.time()
        print('sentence_length time ' + str(time4 - time3))
        # w2c_all_vec
        [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
         w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
         w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
        time5 = time.time()
        print('w2c_all_vec time ' + str(time5 - time4))
        # tdidf_all_vec
        tdidf_cossim = ssf.tdidf_all_vec()
        time6 = time.time()
        print('tdidf_all_vec time ' + str(time6 - time5))
        # edit_all_str
        [str_hamming, str_edit, str_ratio, str_jaro,
         str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
        time7 = time.time()
        print('edit_all_str time ' + str(time7 - time6))
        # Jaccard coefficients
        word_jaccard = ssf.word_jaccard()
        char_jaccard = ssf.char_jaccard()
        time8 = time.time()
        print('jaccard time ' + str(time8 - time7))
        # tdidf_all_vec_pinyin
        tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
        time9 = time.time()
        print('tdidf_all_vec_pinyin time ' + str(time9 - time8))
        # edit_all_pinyin
        [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
         pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
        time10 = time.time()
        print('edit_all_pinyin time ' + str(time10 - time9))
        # pinyin Jaccard coefficients
        word_jaccard_pinyin = ssf.word_jaccard_pinyin()
        char_jaccard_pinyin = ssf.char_jaccard_pinyin()
        time11 = time.time()
        print('jaccard pinyin time ' + str(time11 - time10))
        sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
                        w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
                        w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
                        tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz,
                        str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd,
                        word_jaccard, char_jaccard, tdidf_pinyin_cossim,
                        pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
                        pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd,
                        word_jaccard_pinyin, char_jaccard_pinyin]
        print("sim: ")
        print(sim_all_last)


if __name__ == '__main__':
    ssf = SentenceSimFeature()
    ssf.init_sentence_vector()
    ssf.init_tfidf()

    s1 = "你知道Mo的能力上限吗"
    s2 = "你好呀Mo水平很差"
    start_time = time.time()
    ssf.set_data(s1, s2)
    ssf.encode_sentence_vector()
    time1 = time.time()
    print('set_data time ' + str(time1 - start_time))
    # shared words and lengths
    same_word_count = ssf.same_word_count()
    time2 = time.time()
    print('same_word_count time ' + str(time2 - time1))
    same_char_count = ssf.same_char_count()
    time3 = time.time()
    print('same_char_count time ' + str(time3 - time2))
    [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
    time4 = time.time()
    print('sentence_length time ' + str(time4 - time3))
    # w2c_all_vec
    [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
     w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
     w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
    time5 = time.time()
    print('w2c_all_vec time ' + str(time5 - time4))
    # tdidf_all_vec
    tdidf_cossim = ssf.tdidf_all_vec()
    time6 = time.time()
    print('tdidf_all_vec time ' + str(time6 - time5))
    # edit_all_str
    [str_hamming, str_edit, str_ratio, str_jaro,
     str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
    time7 = time.time()
    print('edit_all_str time ' + str(time7 - time6))
    # Jaccard coefficients
    word_jaccard = ssf.word_jaccard()
    char_jaccard = ssf.char_jaccard()
    time8 = time.time()
    print('jaccard time ' + str(time8 - time7))
    # pinyin
    tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
    time9 = time.time()
    print('tdidf_all_vec_pinyin time ' + str(time9 - time8))
    # edit_all_pinyin
    [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
     pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
    time10 = time.time()
    print('edit_all_pinyin time ' + str(time10 - time9))
    # pinyin Jaccard coefficients
    word_jaccard_pinyin = ssf.word_jaccard_pinyin()
    char_jaccard_pinyin = ssf.char_jaccard_pinyin()
    time11 = time.time()
    print('jaccard pinyin time ' + str(time11 - time10))
    sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
                    w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
                    w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
                    tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz,
                    str_commonsubstr, str_list_Wmd,
                    word_jaccard, char_jaccard, tdidf_pinyin_cossim,
                    pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
                    pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd,
                    word_jaccard_pinyin, char_jaccard_pinyin]
    print("xiaojiang robot sim: ")
    print(sim_all_last)
    sentence_input_t()