nlp_xiaojiang/FeatureProject/distance_text_or_vec.py
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :calculate a variety of text and vector distances

from sklearn.feature_extraction.text import TfidfVectorizer
from utils.text_tools import txtRead, get_syboml, strQ2B
import Levenshtein as Leven
from fuzzywuzzy import fuzz
import jieba.analyse
import numpy as np
import xpinyin
import pickle
import jieba
import os

zero_bit = 0.000000001  # small epsilon to avoid division by zero
pin = xpinyin.Pinyin()


def clear_sentence(sentence):
    """
        data cleaning: remove spaces and convert full-width characters to half-width
    :param sentence: str, input sentence
    :return: str, cleaned sentence
    """
    corpus_one_clear = str(sentence).replace(' ', '').strip()
    ques_q2b = strQ2B(corpus_one_clear.strip())
    ques_q2b_syboml = get_syboml(ques_q2b)
    return ques_q2b_syboml


def chinese2pinyin(sentence):
    """
        convert Chinese text to pinyin
    :param sentence: str, input sentence
    :return: str, output pinyin
    """
    ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ')
    return ques_q2b_syboml_pinying


def hamming_distance(v1, v2):  # Hamming distance between two binary strings
    # NOTE: this bit-string version is shadowed by the sentence-level hamming_distance
    # defined in the simhash section below; hamming_distance_equal keeps this behaviour
    n = int(v1, 2) ^ int(v2, 2)
    return bin(n & 0xffffffff).count('1')


def cosine_distance(v1, v2):  # cosine similarity
    # guard against zero vectors to avoid division by zero
    if np.linalg.norm(v1) and np.linalg.norm(v2):
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    else:
        return 0


def euclidean_distance(v1, v2):  # Euclidean distance
    return np.sqrt(np.sum(np.square(v1 - v2)))


def manhattan_distance(v1, v2):  # Manhattan distance
    return np.sum(np.abs(v1 - v2))


def chebyshev_distance(v1, v2):  # Chebyshev distance
    return np.max(np.abs(v1 - v2))


def minkowski_distance(v1, v2, p=2):  # Minkowski distance; p=2 reduces to the Euclidean distance
    return np.power(np.sum(np.power(np.abs(v1 - v2), p)), 1.0 / p)


def euclidean_distance_standardized(v1, v2):  # standardized Euclidean distance
    v1_v2 = np.vstack([v1, v2])
    sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1)  # sample variance of each dimension
    return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum())


def mahalanobis_distance(v1, v2):  # Mahalanobis distance
    # the Mahalanobis distance needs more samples than dimensions, otherwise the
    # covariance matrix cannot be inverted; the transpose below treats every
    # coordinate of (v1, v2) as one 2-dimensional sample
    X = np.vstack([v1, v2])
    XT = X.T
    # method 1: compute from the formula
    S = np.cov(X)  # covariance matrix between the two vectors
    try:
        SI = np.linalg.inv(S)  # inverse of the covariance matrix
    except np.linalg.LinAlgError:
        SI = np.zeros_like(S)
    # sum the Mahalanobis distances over all pairs of samples
    n = XT.shape[0]
    distance_all = []
    for i in range(0, n):
        for j in range(i + 1, n):
            delta = XT[i] - XT[j]
            distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
            distance_all.append(distance_1)
    return np.sum(np.abs(distance_all))
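
# For reference, the textbook two-point Mahalanobis distance uses an inverse covariance
# matrix estimated from a reference dataset; a minimal sketch with scipy (not imported in
# this module), where "X_ref", "vec1" and "vec2" are hypothetical arrays:
# from scipy.spatial.distance import mahalanobis
# VI = np.linalg.inv(np.cov(X_ref, rowvar=False))  # X_ref: (n_samples, n_dims)
# print(mahalanobis(vec1, vec2, VI))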


def bray_curtis_distance(v1, v2):  # Bray-Curtis distance, an ecological distance used in biology
    up_v1_v2 = np.sum(np.abs(v2 - v1))
    down_v1_v2 = np.sum(v1) + np.sum(v2)
    return up_v1_v2 / (down_v1_v2 + zero_bit)


def pearson_correlation_distance(v1, v2):  # Pearson correlation coefficient
    v1_v2 = np.vstack([v1, v2])
    return np.corrcoef(v1_v2)[0][1]


def jaccard_similarity_coefficient_distance(v1, v2):  # Jaccard dissimilarity (Jaccard similarity coefficient)
    # method 1: compute from the formula
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum())
    down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit)
    return up / down
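
# For comparison: on 0/1 (boolean) vectors this is essentially the same quantity as scipy's
# Jaccard dissimilarity (scipy is not imported here); a minimal sketch:
# from scipy.spatial.distance import jaccard
# print(jaccard(np.array([1, 0, 1]), np.array([1, 1, 0])))  # -> 2/3, share of mismatching non-zero positions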


def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # Word Mover's Distance (WMD)
    # model.init_sims(replace=True)
    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
    return distance
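
# A usage sketch for wmd_distance, assuming pre-trained word2vec vectors stored at the
# hypothetical path "w2v.bin" (requires gensim, which this module does not import):
# from gensim.models import KeyedVectors
# w2v_model = KeyedVectors.load_word2vec_format("w2v.bin", binary=True)
# print(wmd_distance(w2v_model, jieba.lcut("你到底是谁"), jieba.lcut("没想到我是谁")))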


# def HamMings_Levenshtein(str1, str2):
#     sim = Leven.hamming(str1, str2)
#     return sim


def edit_levenshtein(str1, str2):  # Levenshtein edit distance
    return Leven.distance(str1, str2)


def ratio_levenshtein(str1, str2):  # Levenshtein similarity ratio in [0, 1]
    return Leven.ratio(str1, str2)


def jaro_levenshtein(str1, str2):  # Jaro similarity
    return Leven.jaro(str1, str2)


def set_ratio_fuzzywuzzy(str1, str2):  # fuzzywuzzy token-set ratio, 0-100
    return fuzz.token_set_ratio(str1, str2)


def sort_ratio_fuzzywuzzy(str1, str2):  # fuzzywuzzy token-sort ratio, 0-100
    return fuzz.token_sort_ratio(str1, str2)
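
# Illustration of the difference between the two fuzzywuzzy ratios (example strings and
# scores taken from the fuzzywuzzy README, so treat the exact values as indicative):
# fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")  # 100, word order ignored
# fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")         # 100, duplicate tokens ignored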


def num_of_common_sub_str(str1, str2):
    """
        length of the longest common substring of two strings;
        idea: build a 2-D table that stores, for every pair of positions,
        the length of the common suffix ending there
    """
    lstr1 = len(str1)
    lstr2 = len(str2)
    record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # one extra row and column
    maxNum = 0  # length of the longest match
    p = 0  # end position of the longest match in str1
    for i in range(lstr1):
        for j in range(lstr2):
            if str1[i] == str2[j]:
                # same character, extend the common suffix
                record[i + 1][j + 1] = record[i][j] + 1
                if record[i + 1][j + 1] > maxNum:
                    # update the longest match length
                    maxNum = record[i + 1][j + 1]
                    # remember where the longest match ends
                    p = i + 1
    # return str1[p - maxNum:p], maxNum
    return maxNum
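
# e.g. num_of_common_sub_str("abcdef", "xxabcy") == 3, since "abc" is the longest common substring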


####################################################### simhash + Hamming distance
def string_hash(source):
    if source == "":
        return 0
    else:
        x = ord(source[0]) << 7
        m = 1000003
        mask = 2 ** 128 - 1
        for c in source:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(source)
        if x == -1:
            x = -2
        x = bin(x).replace('0b', '').zfill(64)[-64:]
        return str(x)


def sim_hash(content):
    seg = jieba.cut(content)
    keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
    # keywords come back sorted by weight, then by term
    keyList = []
    # print(keyWord)
    for feature, weight in keyWord:
        weight = int(weight * 20)
        feature = string_hash(feature)
        temp = []
        for f in feature:
            if f == '1':
                temp.append(weight)
            else:
                temp.append(-weight)
        keyList.append(temp)
    # no keywords could be extracted (e.g. content that cannot be decoded)
    if len(keyList) == 0:
        return '00'
    content_list = np.sum(np.array(keyList), axis=0)
    simhash = ''
    for c in content_list:
        if c > 0:
            simhash = simhash + '1'
        else:
            simhash = simhash + '0'
    return simhash


def hamming_distance_equal(v1, v2):  # Hamming distance between two equal-length bit strings
    n = int(v1, 2) ^ int(v2, 2)
    return bin(n & 0xffffffff).count('1')


def hamming_distance(sen1, sen2):  # simhash-based Hamming distance between two sentences
    return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2))


def normalization(x):
    """
        min-max normalization to [0, 1]
    :param x:
    :return:
    """
    return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x]


def z_score(x, axis=0):
    """
        z-score standardization (zero mean, unit variance)
    :param x: array, numpy
    :param axis: int, 0
    :return: array, numpy
    """
    x = np.array(x).astype(float)
    xr = np.rollaxis(x, axis=axis)
    xr -= np.mean(x, axis=axis)
    xr /= np.std(x, axis=axis)
    # print(x)
    return x


def tok_td_idf(data_path):
    if os.path.exists(data_path + 'td_idf_cut.csv'):
        '''compute TF-IDF from the training/testing data'''
        datas = txtRead(data_path + 'td_idf_cut.csv')
        # the default token_pattern only matches tokens of length >= 2; it is relaxed here so that
        # single-character tokens are kept, and ngram_range=(1, 2) adds two-word features as well
        # (about 50,428 terms in total on this corpus)
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)
        return vec_tdidf


def tok_td_idf_pinyin(data_path):
    if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'):
        '''compute TF-IDF from the pinyin training/testing data'''
        datas = txtRead(data_path + 'td_idf_cut_pinyin.csv')
        # same settings as tok_td_idf, fitted on the pinyin corpus
        # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=30000)
        vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
                                    max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
        vec_tdidf.fit_transform(datas)
        with open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb') as file_vec_tdidf:
            pickle.dump(vec_tdidf, file_vec_tdidf)
        return vec_tdidf
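
# A usage sketch for the TF-IDF vectorizers above, assuming the pickled model already exists
# under a hypothetical directory "data/" and that input sentences are segmented with spaces,
# like the td_idf_cut.csv training file:
# with open("data/td_idf_cut_model.pkl", 'rb') as f:
#     vec_tdidf = pickle.load(f)
# v1 = vec_tdidf.transform([' '.join(jieba.lcut("你到底是谁"))]).toarray()[0]
# v2 = vec_tdidf.transform([' '.join(jieba.lcut("没想到我是谁"))]).toarray()[0]
# print(cosine_distance(v1, v2))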


if __name__ == '__main__':
    vec1_test = np.array([1, 38, 17, 32])
    vec2_test = np.array([5, 6, 8, 9])

    str1_test = "你到底是谁?"
    str2_test = "没想到我是谁,是真样子"

    print(clear_sentence(str1_test))  # text cleaning
    print(chinese2pinyin(str1_test))  # Chinese to pinyin
    print(euclidean_distance(vec1_test, vec2_test))
    print(cosine_distance(vec1_test, vec2_test))
    print(manhattan_distance(vec1_test, vec2_test))
    print(chebyshev_distance(vec1_test, vec2_test))
    print(minkowski_distance(vec1_test, vec2_test))
    print(euclidean_distance_standardized(vec1_test, vec2_test))
    print(mahalanobis_distance(vec1_test, vec2_test))
    print('###############################################')
    print(bray_curtis_distance(vec1_test, vec2_test))
    print(pearson_correlation_distance(vec1_test, vec2_test))
    print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test))
    print('###############################################')
    # print(HamMings_Levenshtein(str1_test, str2_test))  # requires strings of equal length
    # print(wmd_distance(model, sent1_cut_list, sent2_cut_list))  # requires a gensim word2vec model
    print(hamming_distance(str1_test, str2_test))
    print(edit_levenshtein(str1_test, str2_test))
    print(ratio_levenshtein(str1_test, str2_test))
    print(jaro_levenshtein(str1_test, str2_test))
    print(set_ratio_fuzzywuzzy(str1_test, str2_test))
    print(sort_ratio_fuzzywuzzy(str1_test, str2_test))
    print(num_of_common_sub_str(str1_test, str2_test))
    print(normalization(vec1_test))  # min-max normalization to 0-1
    print(z_score(vec1_test))  # z-score standardization, values centred around 0