# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :calculate various distances between texts or vectors
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.text_tools import txtRead, get_syboml, strQ2B
import Levenshtein as Leven
from fuzzywuzzy import fuzz
import jieba.analyse
import numpy as np
import xpinyin
import pickle
import jieba
import os
zero_bit = 0.000000001
pin = xpinyin.Pinyin()

def clear_sentence(sentence):
    """
    Clean the sentence: remove spaces and convert full-width characters to half-width
    :param sentence: str, input sentence
    :return: str, cleaned sentence
    """
    corpus_one_clear = str(sentence).replace(' ', '').strip()
    ques_q2b = strQ2B(corpus_one_clear.strip())
    ques_q2b_syboml = get_syboml(ques_q2b)
    return ques_q2b_syboml

def chinese2pinyin(sentence):
    """
    Convert Chinese text to pinyin
    :param sentence: str, input sentence
    :return: str, space-separated pinyin
    """
    ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ')
    return ques_q2b_syboml_pinying

def cosine_distance(v1, v2):  # cosine similarity
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        return 0
    return np.dot(v1, v2) / norm_product

def euclidean_distance(v1, v2):  # Euclidean distance
    return np.sqrt(np.sum(np.square(v1 - v2)))


def manhattan_distance(v1, v2):  # Manhattan distance
    return np.sum(np.abs(v1 - v2))


def chebyshev_distance(v1, v2):  # Chebyshev distance
    return np.max(np.abs(v1 - v2))

def minkowski_distance(v1, v2, p=2):  # Minkowski distance; p=2 reduces to the Euclidean distance
    return np.power(np.sum(np.power(np.abs(v1 - v2), p)), 1.0 / p)

def euclidean_distance_standardized(v1, v2):  # standardized Euclidean distance
    v1_v2 = np.vstack([v1, v2])
    sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1)  # per-dimension sample variance
    return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum())

def mahalanobis_distance(v1, v2):  # Mahalanobis distance
    # The Mahalanobis distance needs more samples than dimensions, otherwise the
    # covariance matrix is singular and cannot be inverted.
    # Stacking then transposing turns v1/v2 into len(v1) samples of 2 dimensions each.
    X = np.vstack([v1, v2])
    XT = X.T
    S = np.cov(X)  # covariance matrix between the two dimensions
    try:
        SI = np.linalg.inv(S)  # inverse of the covariance matrix, todo
    except np.linalg.LinAlgError:
        SI = np.zeros_like(S)
    # sum the pairwise Mahalanobis distances between all samples in XT
    n = XT.shape[0]
    distance_all = []
    for i in range(0, n):
        for j in range(i + 1, n):
            delta = XT[i] - XT[j]
            distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
            distance_all.append(distance_1)
    return np.sum(np.abs(distance_all))
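
# An alternative for a single pair of points, assuming scipy is available (a sketch only,
# not used by this module); `u`, `v` and the inverse covariance matrix `VI` are hypothetical:
#     from scipy.spatial import distance
#     d = distance.mahalanobis(u, v, VI)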

def bray_curtis_distance(v1, v2):  # Bray-Curtis distance, an ecological dissimilarity measure
    up_v1_v2 = np.sum(np.abs(v2 - v1))
    down_v1_v2 = np.sum(v1) + np.sum(v2)
    return up_v1_v2 / (down_v1_v2 + zero_bit)

def pearson_correlation_distance(v1, v2):  # Pearson correlation coefficient (returns the correlation itself, not 1 - correlation)
    v1_v2 = np.vstack([v1, v2])
    return np.corrcoef(v1_v2)[0][1]

def jaccard_similarity_coefficient_distance(v1, v2):  # Jaccard similarity coefficient; the formula below computes the Jaccard dissimilarity
    # solved directly from the set-based formula over the nonzero positions
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum())
    down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit)
    return up / down

def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # Word Mover's Distance (WMD)
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance
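
# Minimal usage sketch for wmd_distance, assuming a gensim KeyedVectors model is available
# (the model path below is hypothetical; gensim is not imported by this module):
#     from gensim.models import KeyedVectors
#     w2v_model = KeyedVectors.load_word2vec_format('w2v_model.bin', binary=True)
#     print(wmd_distance(w2v_model, list(jieba.cut("你到底是谁?")), list(jieba.cut("没想到我是谁"))))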
# def HamMings_Levenshtein(str1, str2):
# sim = Leven.hamming(str1, str2)
# return sim

def edit_levenshtein(str1, str2):
    return Leven.distance(str1, str2)


def ratio_levenshtein(str1, str2):
    return Leven.ratio(str1, str2)


def jaro_levenshtein(str1, str2):
    return Leven.jaro(str1, str2)


def set_ratio_fuzzywuzzy(str1, str2):
    return fuzz.token_set_ratio(str1, str2)


def sort_ratio_fuzzywuzzy(str1, str2):
    return fuzz.token_sort_ratio(str1, str2)

def num_of_common_sub_str(str1, str2):
    """
    Length of the longest common substring of two strings.
    Idea: a 2-D table records, for each pair of positions, how long the current run of equal characters is.
    """
    lstr1 = len(str1)
    lstr2 = len(str2)
    record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # one extra row/column of zeros
    maxNum = 0  # length of the longest match so far
    p = 0  # end position of that match in str1
    for i in range(lstr1):
        for j in range(lstr2):
            if str1[i] == str2[j]:
                # equal characters extend the current run
                record[i + 1][j + 1] = record[i][j] + 1
                if record[i + 1][j + 1] > maxNum:
                    # keep the longest match seen so far
                    maxNum = record[i + 1][j + 1]
                    # remember where that match ends
                    p = i + 1
    # return str1[p - maxNum:p], maxNum
    return maxNum
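
# For example, num_of_common_sub_str("abcdef", "zcdem") returns 3 (the common substring "cde").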

####################################################### simhash-based Hamming distance

def string_hash(source):
    """Hash a token to a 64-bit binary string (used by sim_hash below)."""
    if source == "":
        return 0
    else:
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        x = bin(x).replace('0b', '').zfill(64)[-64:]
        return str(x)

def sim_hash(content):
    seg = jieba.cut(content)
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    # keywords come back sorted first by weight, then by word
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    # no keywords could be extracted from the content
    if len(keyList) == 0:
        return '00'
    content_list = np.sum(np.array(keyList), axis=0)
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash

def hamming_distance_equal(v1, v2):
    # Hamming distance between two equal-length binary strings
    n = int(v1, 2) ^ int(v2, 2)
    return bin(n).count('1')


def hamming_distance(sen1, sen2):
    # Hamming distance between the 64-bit simhash fingerprints of two sentences
    return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2))

def normalization(x):
    """
    Min-max normalization to [0, 1]
    :param x: list or array of numbers
    :return: list of floats
    """
    return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x]

def z_score(x, axis=0):
    """
    Z-score standardization (zero mean, unit variance along `axis`)
    :param x: array, numpy
    :param axis: int, 0
    :return: array, numpy
    """
    x = np.array(x).astype(float)
    xr = np.rollaxis(x, axis=axis)
    xr -= np.mean(x, axis=axis)
    xr /= np.std(x, axis=axis)
    # print(x)
    return x

def tok_td_idf(data_path):
    if os.path.exists(data_path + 'td_idf_cut.csv'):
        # compute TF-IDF and build the training/test data
        datas = txtRead(data_path + 'td_idf_cut.csv')
        # The default token_pattern only keeps tokens of length >= 2; r"(?u)\b\w+\b" lowers this to 1,
        # and ngram_range=(1, 2) also adds two-word features (about 50,428 terms on the author's corpus).
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)
        return vec_tdidf

def tok_td_idf_pinyin(data_path):
    if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'):
        # compute TF-IDF on the pinyin version and build the training/test data
        datas = txtRead(data_path + 'td_idf_cut_pinyin.csv')
        # The default token_pattern only keeps tokens of length >= 2; r"(?u)\b\w+\b" lowers this to 1,
        # and ngram_range=(1, 2) also adds two-word features (about 50,428 terms on the author's corpus).
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)
        return vec_tdidf
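
# Minimal sketch of reusing a saved vectorizer, assuming tok_td_idf has already been run
# for some `data_path` (the path below is hypothetical):
#     with open(data_path + 'td_idf_cut_model.pkl', 'rb') as f:
#         vec_tdidf = pickle.load(f)
#     tfidf_vec = vec_tdidf.transform([' '.join(jieba.cut("你到底是谁"))]).toarray()[0]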


if __name__ == '__main__':
    vec1_test = np.array([1, 38, 17, 32])
    vec2_test = np.array([5, 6, 8, 9])
    str1_test = "你到底是谁?"
    str2_test = "没想到我是谁,是真样子"
    print(clear_sentence(str1_test))  # text cleaning
    print(chinese2pinyin(str1_test))  # Chinese to pinyin
    print(euclidean_distance(vec1_test, vec2_test))
    print(cosine_distance(vec1_test, vec2_test))
    print(manhattan_distance(vec1_test, vec2_test))
    print(chebyshev_distance(vec1_test, vec2_test))
    print(minkowski_distance(vec1_test, vec2_test))
    print(euclidean_distance_standardized(vec1_test, vec2_test))
    print(mahalanobis_distance(vec1_test, vec2_test))
    print('###############################################')
    print(bray_curtis_distance(vec1_test, vec2_test))
    print(pearson_correlation_distance(vec1_test, vec2_test))
    print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test))
    print('###############################################')
    # print(HamMings_Levenshtein(str1, str2))  # requires equal-length strings
    # print(wmd_distance(model, sent1_cut_list, sent2_cut_list))  # requires a gensim word2vec model
    print(hamming_distance(str1_test, str2_test))
    print(edit_levenshtein(str1_test, str2_test))
    print(ratio_levenshtein(str1_test, str2_test))
    print(jaro_levenshtein(str1_test, str2_test))
    print(set_ratio_fuzzywuzzy(str1_test, str2_test))
    print(sort_ratio_fuzzywuzzy(str1_test, str2_test))
    print(num_of_common_sub_str(str1_test, str2_test))
    print(normalization(vec1_test))  # min-max normalization to [0, 1]
    print(z_score(vec1_test))  # z-score standardization, centered around 0