#!/usr/bin/python
# -*- coding: UTF-8 -*-
# @time     :2019/4/4 10:00
# @author   :Mo
# @function :calculate a variety of text and vector distances


from sklearn.feature_extraction.text import TfidfVectorizer
from utils.text_tools import txtRead, get_syboml, strQ2B
import Levenshtein as Leven
from fuzzywuzzy import fuzz
import jieba.analyse
import numpy as np
import xpinyin
import pickle
import jieba
import os


# small constant used to avoid division by zero
zero_bit = 0.000000001
pin = xpinyin.Pinyin()


def clear_sentence(sentence):
    """
    clean the data: strip spaces and convert full-width characters to half-width
    :param sentence: str, input sentence
    :return: str, cleaned sentence
    """
    corpus_one_clear = str(sentence).replace(' ', '').strip()
    ques_q2b = strQ2B(corpus_one_clear.strip())
    ques_q2b_syboml = get_syboml(ques_q2b)
    return ques_q2b_syboml


def chinese2pinyin(sentence):
    """
    translate Chinese to pinyin
    :param sentence: str, input sentence
    :return: str, output pinyin
    """
    ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ')
    return ques_q2b_syboml_pinying


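# Illustrative usage (not part of the original module), assuming xpinyin's default
# tone-less output:
# chinese2pinyin('中国')  # -> 'zhong guo'

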
def hamming_distance(v1, v2):
    # Hamming distance between two binary strings (e.g. simhash signatures).
    # Note: identical to hamming_distance_equal below, and shadowed by the
    # sentence-level hamming_distance(sen1, sen2) defined later in this module.
    n = int(v1, 2) ^ int(v2, 2)
    return bin(n & 0xffffffff).count('1')


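# Illustrative value (example inputs, not from the original code):
# hamming_distance('1011101', '1001001')  # -> 2 (two differing bits)

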
def cosine_distance(v1, v2):  # cosine distance (returns the cosine similarity)
    # guard against zero vectors to avoid division by zero
    if np.linalg.norm(v1) > 0 and np.linalg.norm(v2) > 0:
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    else:
        return 0


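# Illustrative value (example inputs, for reference only):
# cosine_distance(np.array([1, 2, 3]), np.array([2, 4, 6]))  # -> 1.0 (parallel vectors)

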
def euclidean_distance(v1, v2):  # Euclidean distance
    return np.sqrt(np.sum(np.square(v1 - v2)))


def manhattan_distance(v1, v2):  # Manhattan distance
    return np.sum(np.abs(v1 - v2))


def chebyshev_distance(v1, v2):  # Chebyshev distance
    return np.max(np.abs(v1 - v2))


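# A quick sanity check of the three distances above (illustrative values):
# v_a, v_b = np.array([0, 0]), np.array([3, 4])
# euclidean_distance(v_a, v_b)  # -> 5.0
# manhattan_distance(v_a, v_b)  # -> 7
# chebyshev_distance(v_a, v_b)  # -> 4

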
def minkowski_distance(v1, v2, p=2):  # Minkowski distance (p=2 reduces to the Euclidean distance)
    return np.power(np.sum(np.power(np.abs(v1 - v2), p)), 1.0 / p)


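# Illustrative usage of the p parameter added above: p=1 gives the Manhattan distance,
# p=2 the Euclidean distance.
# minkowski_distance(np.array([0, 0]), np.array([3, 4]), p=1)  # -> 7.0
# minkowski_distance(np.array([0, 0]), np.array([3, 4]), p=2)  # -> 5.0

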
def euclidean_distance_standardized(v1, v2):  # standardized Euclidean distance
    v1_v2 = np.vstack([v1, v2])
    sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1)  # sample variance of each component
    return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum())


def mahalanobis_distance(v1, v2):  # Mahalanobis distance
    # The Mahalanobis distance requires more samples than dimensions,
    # otherwise the covariance matrix cannot be inverted.
    # v1 and v2 are stacked and transposed, so each row of XT is a 2-dimensional
    # sample (v1[k], v2[k]) and there are len(v1) samples in total.
    X = np.vstack([v1, v2])
    XT = X.T

    # covariance matrix between the two dimensions
    S = np.cov(X)
    try:
        SI = np.linalg.inv(S)  # inverse of the covariance matrix
    except np.linalg.LinAlgError:
        SI = np.zeros_like(S)

    # sum of the pairwise Mahalanobis distances between the n samples
    n = XT.shape[0]
    distance_all = []
    for i in range(0, n):
        for j in range(i + 1, n):
            delta = XT[i] - XT[j]
            distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
            distance_all.append(distance_1)
    return np.sum(np.abs(distance_all))


def bray_curtis_distance(v1, v2):  # Bray-Curtis distance, an ecological distance from biology
    up_v1_v2 = np.sum(np.abs(v2 - v1))
    down_v1_v2 = np.sum(v1) + np.sum(v2)
    return up_v1_v2 / (down_v1_v2 + zero_bit)


def pearson_correlation_distance(v1, v2):  # Pearson correlation coefficient
    v1_v2 = np.vstack([v1, v2])
    return np.corrcoef(v1_v2)[0][1]


def jaccard_similarity_coefficient_distance(v1, v2):  # Jaccard distance
    # computed directly from the set-based formula over the nonzero positions
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum())
    down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit)
    return up / down


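# Illustrative value (example inputs): for binary vectors this is the classic
# Jaccard distance |A xor B| / |A or B|.
# jaccard_similarity_coefficient_distance([1, 1, 0, 1], [1, 0, 0, 0])  # -> ~0.667

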
def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # Word Mover's Distance (WMD)
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance


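# Hedged usage sketch (not part of the original module): wmd_distance expects a
# gensim word-vector model exposing wmdistance(); the model path below is hypothetical.
# from gensim.models import KeyedVectors
# w2v = KeyedVectors.load_word2vec_format('w2v_model.bin', binary=True)  # hypothetical path
# print(wmd_distance(w2v, list(jieba.cut('你到底是谁')), list(jieba.cut('没想到我是谁'))))

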
# def HamMings_Levenshtein(str1, str2):
#     sim = Leven.hamming(str1, str2)
#     return sim


def edit_levenshtein(str1, str2):
    return Leven.distance(str1, str2)


def ratio_levenshtein(str1, str2):
    return Leven.ratio(str1, str2)


def jaro_levenshtein(str1, str2):
    return Leven.jaro(str1, str2)


def set_ratio_fuzzywuzzy(str1, str2):
    return fuzz.token_set_ratio(str1, str2)


def sort_ratio_fuzzywuzzy(str1, str2):
    return fuzz.token_sort_ratio(str1, str2)


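# Rough guide to the string-similarity wrappers above (illustrative; exact values
# depend on the installed Levenshtein / fuzzywuzzy versions):
# edit_levenshtein('kitten', 'sitting')   # -> 3 (edit operations)
# ratio_levenshtein('kitten', 'sitting')  # -> similarity in [0, 1]
# set_ratio_fuzzywuzzy('fuzzy wuzzy was a bear', 'wuzzy fuzzy was a bear')  # -> 100

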
def num_of_common_sub_str(str1, str2):
    """
    length of the longest common substring of two strings
    idea: build a 2-D table that records whether consecutive positions match
    """
    lstr1 = len(str1)
    lstr2 = len(str2)
    record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # one extra row and column
    maxNum = 0  # length of the longest match
    p = 0  # end position of the longest match in str1

    for i in range(lstr1):
        for j in range(lstr2):
            if str1[i] == str2[j]:
                # characters match, so extend the previous match by one
                record[i + 1][j + 1] = record[i][j] + 1
                if record[i + 1][j + 1] > maxNum:
                    # keep the longest match length
                    maxNum = record[i + 1][j + 1]
                    # and remember where it ends
                    p = i + 1
    # return str1[p - maxNum:p], maxNum
    return maxNum


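# Illustrative value (example inputs):
# num_of_common_sub_str('abcdef', 'xxcdeyy')  # -> 3 ('cde' is the longest common substring)

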
####################################################### Hamming distance over simhash signatures
def string_hash(source):
    # hash a token into a fixed-length binary string (used by sim_hash below)
    if source == "":
        return 0
    else:
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        x = bin(x).replace('0b', '').zfill(64)[-64:]

        return str(x)


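# Note (assumption): for non-empty input string_hash always yields a 64-character
# binary string, e.g. len(string_hash('词')) == 64.

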
def sim_hash(content):
    seg = jieba.cut(content)
    # top keywords, ranked by weight and then by word
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    # no keywords could be extracted from the content
    if len(keyList) == 0:
        return '00'
    content_list = np.sum(np.array(keyList), axis=0)
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash


def hamming_distance_equal(v1, v2):
    n = int(v1, 2) ^ int(v2, 2)
    return bin(n & 0xffffffff).count('1')


def hamming_distance(sen1, sen2):
    # Hamming distance between the simhash signatures of two sentences
    return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2))


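# Hedged usage sketch: sentence-level Hamming distance via simhash; the sentences
# are arbitrary examples.
# hamming_distance('你到底是谁', '没想到我是谁')  # -> small integer, lower means more similar

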
def normalization(x):
    """
    min-max normalization to [0, 1]
    :param x: list or array of numbers
    :return: list of floats
    """
    return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x]


def z_score(x, axis=0):
    """
    z-score standardization (zero mean, unit variance)
    :param x: array, numpy
    :param axis: int, 0
    :return: array, numpy
    """
    x = np.array(x).astype(float)
    xr = np.rollaxis(x, axis=axis)
    xr -= np.mean(x, axis=axis)
    xr /= np.std(x, axis=axis)
    # print(x)
    return x


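# Illustrative values (example inputs):
# normalization([1, 5, 3])  # -> [0.0, ~1.0, ~0.5]
# z_score([1, 2, 3])        # -> [-1.2247, 0.0, 1.2247] (approximately)

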
def tok_td_idf(data_path):
    vec_tdidf = None
    if os.path.exists(data_path + 'td_idf_cut.csv'):
        # compute TF-IDF on the segmented corpus used for training/testing
        datas = txtRead(data_path + 'td_idf_cut.csv')
        # the default token_pattern only matches tokens of length >= 2, so it is relaxed to
        # keep single characters; ngram_range=(1, 2) also adds bigram features
        # (about 50,428 terms in total)
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)

    return vec_tdidf


def tok_td_idf_pinyin(data_path):
    vec_tdidf = None
    if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'):
        # compute TF-IDF on the pinyin version of the segmented corpus
        datas = txtRead(data_path + 'td_idf_cut_pinyin.csv')
        # same vectorizer settings as tok_td_idf above
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)

    return vec_tdidf


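# Hedged usage sketch (assumption): the returned vectorizer can encode new sentences
# once it has been fitted; the data directory below is hypothetical.
# vec = tok_td_idf('data/')
# if vec is not None:
#     matrix = vec.transform([' '.join(jieba.cut('你到底是谁'))])

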
if __name__ == '__main__':
    vec1_test = np.array([1, 38, 17, 32])
    vec2_test = np.array([5, 6, 8, 9])

    str1_test = "你到底是谁?"
    str2_test = "没想到我是谁,是真样子"

    print(clear_sentence(str1_test))  # data cleaning
    print(chinese2pinyin(str1_test))  # Chinese to pinyin

    print(euclidean_distance(vec1_test, vec2_test))
    print(cosine_distance(vec1_test, vec2_test))
    print(manhattan_distance(vec1_test, vec2_test))
    print(euclidean_distance(vec1_test, vec2_test))
    print(chebyshev_distance(vec1_test, vec2_test))
    print(minkowski_distance(vec1_test, vec2_test))

    print(euclidean_distance_standardized(vec1_test, vec2_test))
    print(mahalanobis_distance(vec1_test, vec2_test))

    print('###############################################')

    print(bray_curtis_distance(vec1_test, vec2_test))
    print(pearson_correlation_distance(vec1_test, vec2_test))
    print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test))

    print('###############################################')

    # print(HamMings_Levenshtein(str1, str2))  # requires strings of equal length
    # print(wmd_distance(model, sent1_cut_list, sent2_cut_list))  # requires a gensim word2vec model

    print(hamming_distance(str1_test, str2_test))
    print(edit_levenshtein(str1_test, str2_test))
    print(ratio_levenshtein(str1_test, str2_test))
    print(jaro_levenshtein(str1_test, str2_test))
    print(set_ratio_fuzzywuzzy(str1_test, str2_test))
    print(sort_ratio_fuzzywuzzy(str1_test, str2_test))
    print(num_of_common_sub_str(str1_test, str2_test))
    print(normalization(vec1_test))  # min-max normalization to [0, 1]
    print(z_score(vec1_test))  # z-score standardization (centered around 0, positive and negative)