Fix numpy RuntimeWarning
RuntimeWarning: invalid value encountered in true_divide, raised by:
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
This commit is contained in:
parent 62c7af0922 · commit 977e93701f
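The warning comes from rows of the question matrix whose L2 norm is zero (for example, an empty question or one made only of out-of-vocabulary characters): dividing such a row by its zero norm yields NaN and emits the RuntimeWarning. A minimal sketch of the problem and of the guard this commit applies, with a toy matrix invented for illustration:

import numpy as np

# Toy sentence-vector matrix; the second row is all zeros, as happens for
# questions whose characters are all out of vocabulary.
matrix_org = np.array([[3.0, 4.0], [0.0, 0.0]])

# Before the fix: a zero row gives a zero norm, and 0/0 emits the
# RuntimeWarning and produces NaN.
matrix_org_sqrt = np.sqrt((matrix_org ** 2).sum(-1))[:, np.newaxis]

# The fix: replace zero norms with a tiny epsilon before dividing,
# so zero rows stay zero instead of becoming NaN.
matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(np.float32)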
@@ -4,19 +4,18 @@
 # @author :Mo
 # @function :chatbot based search, encode sentence_vec by char

-from conf.path_config import w2v_model_char_path
-from conf.path_config import matrix_ques_part_path_char
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import projectdir, chicken_and_gossip_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
+
+import gensim
+import jieba
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type
+
+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path_char
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_char_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -24,31 +23,51 @@ def load_word2vec_model(path, bin=False, limit=None):
     return word2vec_model


-def encoding_question(w2v_model, char_list):
+def question_encoding(w2v_model, char_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(char_list)):
         char_list_one = char_list[k]
         if type(char_list_one) == str:
+            word = char_list[k]
             try:
-                sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
+                sentence_vec = sentence_vec + w2v_model.wv[word]
             except Exception as e:
-                print(str(e))
                 if char_list_one not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
+                    sentence_vec = sentence_vec + 1  # add 1 for unknown tokens
     return sentence_vec


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and reduce computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, index the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+    # unit-normalize matrix_org
+    # square every sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each squared vector down to one number, i.e. add up the elements of every row, collapsing the matrix to a single column
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root of each sum; np.newaxis adds a new axis
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # avoid the divide-by-zero RuntimeWarning
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Most similar sentences: dot the question vector with the matrix
     :param vec:
@@ -57,19 +76,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, index the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector; scale a vector to unit length. The only exception is the zero vector, which is returned unchanged.
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity between the two vectors with numpy's dot function (matrix dot product)
+    # matrix dot product: the query against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec within bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)

     # get the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -97,7 +110,7 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         char_list = [ques_char for ques_char in ques]
-        sentence_vec = encoding_question(word2vec_model, char_list)
+        sentence_vec = question_encoding(word2vec_model, char_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -106,11 +119,10 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
             matrix_ques = []
             break

-    # count += 1
-    # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
+    count += 1
+    np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)

     print('create_matrix_org_pkl ok!')
     # return matrix_ques


 if __name__ == '__main__':
@@ -123,16 +135,21 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path_char):
-        # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
-    # reload
+        create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
+    # load the standard-question matrix
     print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path_char)
-    print("np.loadtxt(matrix_ques_part_path_char) ok!")
+    print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你问: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
         char_list = [ques_char for ques_char in ques_clean]
-        sentence_vic = encoding_question(word2vec_model, char_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        sentence_vec = question_encoding(word2vec_model, char_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
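After this refactor the char-level pipeline normalizes the question matrix once at startup (basic_questions_matrix_init) and then answers each query with one matrix-vector dot product plus an argsort (calculate_text_similar). A standalone sketch of that retrieval core, with random stand-in data instead of the real encoded questions:

import numpy as np
from gensim import matutils

# Stand-in data: 1000 encoded questions of dimension 300 and one encoded query.
matrix_ques = np.random.rand(1000, 300).astype(np.float32)
vec_ques = np.random.rand(300).astype(np.float32)

# One-time preprocessing: unit-normalize every row, guarding zero norms.
norms = np.sqrt((matrix_ques ** 2).sum(-1))[:, np.newaxis]
norms[norms == 0] = 1e-9
matrix_org_norm = matrix_ques / norms

# Per query: unit-normalize the query, then a single dot product gives the
# cosine similarity against every stored question; argsort picks the best.
vec_ques_mean = matutils.unitvec(vec_ques).astype(np.float32)
scores = np.dot(matrix_org_norm, vec_ques_mean)
top_20 = matutils.argsort(scores, topn=20, reverse=True)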
@@ -5,20 +5,20 @@
 # @function :chatbot based search, encode sentence_vec by word


-from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
-from conf.path_config import projectdir, chicken_and_gossip_path
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import matrix_ques_part_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
+
+import gensim
+import jieba
+import time
+import jieba.posseg as jieba_seg
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type
+
+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -51,18 +51,19 @@ def get_jieba_flag(flag):
     return weight


-def word_segment_process(sentence):
+def word_flag_cut(sentence):
     """
-    jieba word segmentation \ POS tagging
+    jieba word segmentation and POS tagging
     :param sentence:
     :return:
     """
-    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
+    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
+        replace(' ', '').replace('\t', '').upper().strip()
     word_list = []
     flag_list = []
     try:
         sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
-        words = pseg.cut(sentence_cut)
+        words = jieba_seg.cut(sentence_cut)
         for word in words:
             word_list.append(word.word)
             flag_list.append(word.flag)
@@ -72,32 +73,52 @@ def word_segment_process(sentence):
     return word_list, flag_list


-def encoding_question(w2v_model, word_list, flag_list):
+def basic_questions_encoding(w2v_model, word_list, flag_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
-
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(word_list)):
         word = word_list[k]
         flag = flag_list[k]
         if type(word) == str:
             try:
                 sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
             except Exception as e:
                 if word not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
-
+                    sentence_vec = sentence_vec + 1  # add 1 for unknown words
     return sentence_vec


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and reduce computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, index the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+    # unit-normalize matrix_org
+    # square every sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each squared vector down to one number, i.e. add up the elements of every row, collapsing the matrix to a single column
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root of each sum; np.newaxis adds a new axis
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # avoid the divide-by-zero RuntimeWarning
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Most similar sentences: dot the question vector with the matrix
     :param vec:
@@ -106,19 +127,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, index the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector; scale a vector to unit length. The only exception is the zero vector, which is returned unchanged.
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity between the two vectors with numpy's dot function (matrix dot product)
+    # matrix dot product: the query against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec within bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)

     # get the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -146,8 +161,8 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         # questions.append(ques)
-        word_list, flag_list = word_segment_process(ques)
-        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
+        word_list, flag_list = word_flag_cut(ques)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -181,17 +196,20 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path):
         create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)

-    # load
+    # load the standard-question matrix
     print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path)
     print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
-        word_list, flag_list = word_segment_process(ques_clean)
-        sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        word_list, flag_list = word_flag_cut(ques_clean)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
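The word-level variant differs from the char-level one mainly in encoding: basic_questions_encoding weights each word vector by its part-of-speech tag via get_jieba_flag. A hedged sketch of that encoding step, assuming a loaded gensim w2v_model; the weight table and function name here are stand-ins for illustration, not the repo's real get_jieba_flag:

import jieba.posseg as jieba_seg
import numpy as np

# Hypothetical POS weights; the real mapping lives in get_jieba_flag.
FLAG_WEIGHT = {'n': 1.3, 'v': 1.2}

def encode_by_word(w2v_model, sentence):
    # Sum POS-weighted word vectors, as basic_questions_encoding does above.
    sentence_vec = np.zeros(w2v_model.vector_size, dtype=np.float32)
    for pair in jieba_seg.cut(sentence):
        try:
            sentence_vec += w2v_model.wv[pair.word] * FLAG_WEIGHT.get(pair.flag, 1.0)
        except KeyError:
            sentence_vec += 1  # unknown word: add 1, mirroring the diff
    return sentence_vec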