Fix numpy warning issue

RuntimeWarning: invalid value encountered in true_divide
    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
yongzhuo 2019-07-02 23:26:18 +08:00 committed by GitHub
parent 62c7af0922
commit 977e93701f
2 changed files with 122 additions and 87 deletions
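
The warning fires when a row of the sentence-vector matrix is all zeros: its L2 norm is zero, and dividing by it yields NaN. A minimal sketch of the failure and of the epsilon guard this commit introduces (the toy matrix is made up; the guard mirrors basic_questions_matrix_init below):

```python
import numpy as np

# Hypothetical toy data: the second row is all zeros (e.g. an empty question),
# so its L2 norm is 0 and the old division emitted the RuntimeWarning.
matrix_org = np.array([[3.0, 4.0], [0.0, 0.0]])

norms = np.sqrt((matrix_org ** 2).sum(-1))[:, np.newaxis]  # [[5.], [0.]]
# The commit's guard: replace zero norms with a tiny epsilon before dividing,
# so zero rows stay (numerically) zero instead of becoming NaN.
norms[norms == 0] = 1e-9
matrix_org_norm = (matrix_org / norms).astype(np.float32)
print(matrix_org_norm)  # [[0.6 0.8], [0. 0.]]
```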

View File

@@ -4,19 +4,18 @@
# @author :Mo
# @function :chatbot based search, encode sentence_vec by char
from conf.path_config import w2v_model_char_path
from conf.path_config import matrix_ques_part_path_char
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import projectdir, chicken_and_gossip_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import os
import pickle
import gensim
import jieba
import numpy as np
from gensim import matutils
from numpy import float32 as numpy_type
from nlp_xiaojiang.conf.path_config import matrix_ques_part_path_char
from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
from nlp_xiaojiang.conf.path_config import w2v_model_char_path
from nlp_xiaojiang.utils.text_tools import txtRead, getChinese
def load_word2vec_model(path, bin=False, limit=None):
@@ -24,31 +23,51 @@ def load_word2vec_model(path, bin=False, limit=None):
return word2vec_model
def encoding_question(w2v_model, char_list):
def question_encoding(w2v_model, char_list):
''' Build a sentence vector for a question, character by character.
:param w2v_model: loaded gensim word2vec model
:param char_list: list of characters from the question
:return: np.array sentence vector, len=300
'''
try:
sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0
except:
sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
for k in range(len(char_list)):
char_list_one = char_list[k]
if type(char_list_one) == str:
word = char_list[k]
try:
sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
sentence_vec = sentence_vec + w2v_model.wv[word]
except Exception as e:
print(str(e))
if char_list_one not in [' ', '']:
sentence_vec = sentence_vec + 1
sentence_vec = sentence_vec + 1 # add 1 for an unknown (OOV) word
return sentence_vec
def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
def basic_questions_matrix_init(matrix_org, top_vec=20):
"""
单位化和初始化基本问题矩阵以方便点乘, 减小计算量等
:param matrix_org:
:param top_vec:
:return:
"""
len_matrix_org = len(matrix_org)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# index the sentence-vector matrix first
matrix_org_index = list(range(len_matrix_org))
# normalize matrix_org to unit length
# square every element of each sentence vector
matrix_org_xinxin = matrix_org ** 2
# sum along the last axis, collapsing each row (one sentence vector) to a single number
matrix_org_sum = matrix_org_xinxin.sum(-1)
# square root of each sum; np.newaxis adds an axis so the division below broadcasts row-wise
matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis] # + 1e-9
# the warning fix: replace zero norms with a tiny epsilon before dividing
matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
# divide each sentence vector by its L2 norm
matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
return matrix_org_norm, matrix_org_index, top_vec
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
"""
最相似的句子句向量与矩阵点乘
:param vec:
@ -57,19 +76,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
:param topn:
:return:
"""
# index the sentence-vector matrix first
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
# Normalize the question vector: scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# normalize matrix_org to unit length
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# compute the similarity of two vectors with numpy's dot (matrix multiply)
# matrix dot product: the question vector against every question in the basic-question library
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# sort by similarity
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
# collect the index and score of the most similar basic questions
index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -97,7 +110,7 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0])
char_list = [ques_char for ques_char in ques]
sentence_vec = encoding_question(word2vec_model, char_list)
sentence_vec = question_encoding(word2vec_model, char_list)
matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count))
@@ -106,11 +119,10 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
matrix_ques = []
break
# count += 1
# np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
count += 1
np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
print('create_matrix_org_pkl ok!')
# return matrix_ques
if __name__ == '__main__':
@@ -123,16 +135,21 @@ if __name__ == '__main__':
if not os.path.exists(matrix_ques_part_path_char):
# matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
# reload
# load the basic-question matrix
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path_char)
print("np.loadtxt(matrix_ques_part_path_char) ok!")
print("np.loadtxt(matrix_ques_part_path) end!")
# initialize and preprocess the basic-question matrix
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)
while True:
print("你问: ")
ques_ask = input()
ques_clean = getChinese(ques_ask)
char_list = [ques_char for ques_char in ques_clean]
sentence_vic = encoding_question(word2vec_model, char_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
sentence_vec = question_encoding(word2vec_model, char_list)
top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
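
Taken together, this refactor moves the one-off work (indexing and unit-normalizing the basic-question matrix) out of the per-query path, leaving each lookup a single matrix-vector dot product plus a top-k sort. A usage sketch with random stand-in data; the shapes and the 300-dim assumption are illustrative, the two function names are the ones introduced in this file:

```python
import numpy as np

# Assumed shapes: 1000 basic questions, 300-dim char-level sentence vectors.
matrix_ques = np.random.rand(1000, 300).astype(np.float32)

# One-time preprocessing at startup.
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

# Per query: encode, then a single dot product against the normalized matrix.
vec_ques = np.random.rand(300).astype(np.float32)  # stand-in for question_encoding(...)
index_score = calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec)
print(index_score[0])  # [row_index, cosine_score] of the best match
```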

View File

@@ -5,20 +5,20 @@
# @function :chatbot based search, encode sentence_vec by word
from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
from conf.path_config import projectdir, chicken_and_gossip_path
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import matrix_ques_part_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import os
import pickle
import gensim
import jieba
import time
import jieba.posseg as jieba_seg
import numpy as np
from gensim import matutils
from numpy import float32 as numpy_type
from nlp_xiaojiang.conf.path_config import matrix_ques_part_path
from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
from nlp_xiaojiang.conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
from nlp_xiaojiang.utils.text_tools import txtRead, getChinese
def load_word2vec_model(path, bin=False, limit=None):
@@ -51,18 +51,19 @@ def get_jieba_flag(flag):
return weight
def word_segment_process(sentence):
def word_flag_cut(sentence):
"""
jieba word segmentation \ POS tagging
jieba word segmentation and POS tagging
:param sentence:
:return:
"""
sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
replace(' ', '').replace('\t', '').upper().strip()
word_list = []
flag_list = []
try:
sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
words = pseg.cut(sentence_cut)
words = jieba_seg.cut(sentence_cut)
for word in words:
word_list.append(word.word)
flag_list.append(word.flag)
@@ -72,32 +73,52 @@ def word_segment_process(sentence):
return word_list, flag_list
def encoding_question(w2v_model, word_list, flag_list):
def basic_questions_encoding(w2v_model, word_list, flag_list):
''' Build a sentence vector from segmented words, each weighted by its POS flag.
:param w2v_model: loaded gensim word2vec model
:param word_list: list of segmented words
:param flag_list: list of POS flags, parallel to word_list
:return: np.array sentence vector, len=300
'''
try:
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
except:
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
for k in range(len(word_list)):
word = word_list[k]
flag = flag_list[k]
if type(word) == str:
try:
sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
except Exception as e:
if word not in [' ', '']:
sentence_vec = sentence_vec + 1
sentence_vec = sentence_vec + 1 # add 1 for an unknown (OOV) word
return sentence_vec
def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
def basic_questions_matrix_init(matrix_org, top_vec=20):
"""
单位化和初始化基本问题矩阵以方便点乘, 减小计算量等
:param matrix_org:
:param top_vec:
:return:
"""
len_matrix_org = len(matrix_org)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# index the sentence-vector matrix first
matrix_org_index = list(range(len_matrix_org))
# normalize matrix_org to unit length
# square every element of each sentence vector
matrix_org_xinxin = matrix_org ** 2
# sum along the last axis, collapsing each row (one sentence vector) to a single number
matrix_org_sum = matrix_org_xinxin.sum(-1)
# square root of each sum; np.newaxis adds an axis so the division below broadcasts row-wise
matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis] # + 1e-9
# the warning fix: replace zero norms with a tiny epsilon before dividing
matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
# divide each sentence vector by its L2 norm
matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
return matrix_org_norm, matrix_org_index, top_vec
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
"""
Find the most similar sentences: dot the question vector with the matrix.
:param vec:
@@ -106,19 +127,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
:param topn:
:return:
"""
# index the sentence-vector matrix first
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
# Normalize the question vector: scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# normalize matrix_org to unit length
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# compute the similarity of two vectors with numpy's dot (matrix multiply)
# matrix dot product: the question vector against every question in the basic-question library
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# sort by similarity
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
# collect the index and score of the most similar basic questions
index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -146,8 +161,8 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_wo
for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0])
# questions.append(ques)
word_list, flag_list = word_segment_process(ques)
sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
word_list, flag_list = word_flag_cut(ques)
sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count))
@@ -181,17 +196,20 @@ if __name__ == '__main__':
if not os.path.exists(matrix_ques_part_path):
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)
# load
# load the basic-question matrix
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path)
print("np.loadtxt(matrix_ques_part_path) end!")
# initialize and preprocess the basic-question matrix
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)
while True:
print("你: ")
ques_ask = input()
ques_clean = getChinese(ques_ask)
word_list, flag_list = word_segment_process(ques_clean)
sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
word_list, flag_list = word_flag_cut(ques_clean)
sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
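
The word-level file differs from the char-level one mainly in that each word vector is scaled by a POS-dependent weight from get_jieba_flag before being summed. A self-contained sketch of that encoding loop, assuming a loaded 300-dimensional gensim model; the demo weights are placeholders, not the repo's actual values:

```python
import numpy as np
import jieba.posseg as jieba_seg

def get_jieba_flag_demo(flag):
    # Placeholder weights: favour content words (nouns/verbs); the repo's
    # get_jieba_flag defines the actual scheme.
    return 1.3 if flag in ('n', 'v', 'vn') else 1.0

def encode_by_word(w2v_model, sentence):
    # Mirrors basic_questions_encoding: sum POS-weighted word vectors,
    # adding 1 for unknown (out-of-vocabulary) words.
    sentence_vec = np.zeros(300, dtype=np.float32)  # assumes 300-dim vectors
    for pair in jieba_seg.cut(sentence):
        try:
            sentence_vec += w2v_model.wv[pair.word] * get_jieba_flag_demo(pair.flag)
        except KeyError:
            if pair.word.strip():
                sentence_vec += 1  # unknown word: add 1
    return sentence_vec
```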