修复 numpy warning 问题 (fix the numpy RuntimeWarning raised while normalizing the question matrix)

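RuntimeWarning: invalid value encountered in true_divide — raised by the statement `matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)` when a row of `matrix_org` is all zeros (its norm is 0, so the division is 0/0).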
This commit is contained in:
yongzhuo 2019-07-02 23:26:18 +08:00 committed by GitHub
parent 62c7af0922
commit 977e93701f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 122 additions and 87 deletions

View File

@ -4,19 +4,18 @@
# @author :Mo # @author :Mo
# @function :chatbot based search, encode sentence_vec by char # @function :chatbot based search, encode sentence_vec by char
from conf.path_config import w2v_model_char_path import os
from conf.path_config import matrix_ques_part_path_char import pickle
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import projectdir, chicken_and_gossip_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import gensim import gensim
import jieba import numpy as np
from gensim import matutils
from numpy import float32 as numpy_type
from nlp_xiaojiang.conf.path_config import matrix_ques_part_path_char
from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
from nlp_xiaojiang.conf.path_config import w2v_model_char_path
from nlp_xiaojiang.utils.text_tools import txtRead, getChinese
def load_word2vec_model(path, bin=False, limit=None): def load_word2vec_model(path, bin=False, limit=None):
@ -24,31 +23,51 @@ def load_word2vec_model(path, bin=False, limit=None):
return word2vec_model return word2vec_model
def question_encoding(w2v_model, char_list):
    """Build a sentence vector by summing the vectors of its characters.

    :param w2v_model: word2vec model; must support ``w2v_model.wv[token]``
        lookup and expose an ``index2word`` vocabulary list.
    :param char_list: list of characters (tokens) making up the sentence.
    :return: array with the same shape as one embedding vector; all zeros
        when ``char_list`` is empty.
    """
    # Any in-vocabulary vector works as a shape/dtype template for the zero
    # start vector. Index 1 is kept as the preferred choice; index 0 is the
    # fallback so a one-word vocabulary does not raise IndexError.
    vocab = w2v_model.index2word
    template_word = vocab[1] if len(vocab) > 1 else vocab[0]
    sentence_vec = w2v_model.wv[template_word] * 0

    for word in char_list:
        try:
            sentence_vec = sentence_vec + w2v_model.wv[word]
        except (KeyError, TypeError):
            # Unknown token (or an entry that cannot be looked up at all):
            # add 1 to every component instead of the token's vector.
            sentence_vec = sentence_vec + 1
    return sentence_vec
def basic_questions_matrix_init(matrix_org, top_vec=20):
    """Normalize the standard-question matrix once so later queries only need a dot product.

    Each row (one sentence vector) is scaled to unit L2 length.

    :param matrix_org: array-like of sentence vectors, one per row. A single
        1-D vector is accepted and treated as a one-row matrix.
    :param top_vec: requested number of best matches.
    :return: tuple ``(matrix_org_norm, matrix_org_index, top_vec)`` where
        ``matrix_org_norm`` is the row-normalized matrix cast to ``numpy_type``,
        ``matrix_org_index`` is ``[0, 1, ..., rows - 1]`` and ``top_vec`` is
        clamped so it never exceeds the number of rows.
    """
    matrix_org = np.asarray(matrix_org)
    # np.loadtxt returns a 1-D array for a single-row file; promote it to one
    # row so the per-row reduction below still applies.
    if matrix_org.ndim == 1 and matrix_org.size:
        matrix_org = matrix_org[np.newaxis, :]

    row_count = len(matrix_org)
    # Keep top_vec within the number of available rows.
    top_vec = min(row_count, top_vec)
    # Positional label for every sentence vector.
    matrix_org_index = list(range(row_count))

    # L2 norm of each row; keepdims keeps a trailing axis for broadcasting.
    row_norms = np.sqrt((matrix_org ** 2).sum(-1, keepdims=True))
    # All-zero rows have norm 0; substitute a tiny value so the division does
    # not emit "invalid value encountered in true_divide" (the row stays 0).
    row_norms[row_norms == 0] = 1e-9
    # Divide every row by its norm (not by its mean).
    matrix_org_norm = (matrix_org / row_norms).astype(numpy_type)
    return matrix_org_norm, matrix_org_index, top_vec
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
""" """
最相似的句子句向量与矩阵点乘 最相似的句子句向量与矩阵点乘
:param vec: :param vec:
@ -57,19 +76,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
:param topn: :param topn:
:return: :return:
""" """
# 首先对句向量矩阵标号 # 问句向量标准化, Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# matrix_org单位化 # 矩阵点乘, 即问句与标准问句库里边的问句点乘,
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# 计算两个向量之间的相似度使用numpy的dot函数矩阵点乘
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# 防止top_vec越界
top_vec = min(len(matrix_org), top_vec)
# 相似度排序 # 相似度排序
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
# 获取最相似标准问句的index和得分score
index_score = [] index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]: for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@ -97,7 +110,7 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
for qa_dail_one in qa_dail: for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0]) ques = getChinese(qa_dail_one.split('\t')[0])
char_list = [ques_char for ques_char in ques] char_list = [ques_char for ques_char in ques]
sentence_vec = encoding_question(word2vec_model, char_list) sentence_vec = question_encoding(word2vec_model, char_list)
matrix_ques.append(sentence_vec) matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count)) print("count: " + str(count))
@ -106,11 +119,10 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
matrix_ques = [] matrix_ques = []
break break
# count += 1 count += 1
# np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques) np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
print('create_matrix_org_pkl ok!') print('create_matrix_org_pkl ok!')
# return matrix_ques
if __name__ == '__main__': if __name__ == '__main__':
@ -123,16 +135,21 @@ if __name__ == '__main__':
if not os.path.exists(matrix_ques_part_path_char): if not os.path.exists(matrix_ques_part_path_char):
# matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
# 重载
# 读取标准问句矩阵
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path_char) matrix_ques = np.loadtxt(matrix_ques_part_path_char)
print("np.loadtxt(matrix_ques_part_path_char) ok!") print("np.loadtxt(matrix_ques_part_path) end!")
# 标准问句矩阵初始化和预处理
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)
while True: while True:
print("你问: ") print("你问: ")
ques_ask = input() ques_ask = input()
ques_clean = getChinese(ques_ask) ques_clean = getChinese(ques_ask)
char_list = [ques_char for ques_char in ques_clean] char_list = [ques_char for ques_char in ques_clean]
sentence_vic = encoding_question(word2vec_model, char_list) sentence_vec = question_encoding(word2vec_model, char_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
try: try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])

View File

@ -5,20 +5,20 @@
# @function :chatbot based search, encode sentence_vec by word # @function :chatbot based search, encode sentence_vec by word
from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path import os
from conf.path_config import projectdir, chicken_and_gossip_path import pickle
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import matrix_ques_part_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import gensim import gensim
import jieba import jieba
import time import jieba.posseg as jieba_seg
import numpy as np
from gensim import matutils
from numpy import float32 as numpy_type
from nlp_xiaojiang.conf.path_config import matrix_ques_part_path
from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
from nlp_xiaojiang.conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
from nlp_xiaojiang.utils.text_tools import txtRead, getChinese
def load_word2vec_model(path, bin=False, limit=None): def load_word2vec_model(path, bin=False, limit=None):
@ -51,18 +51,19 @@ def get_jieba_flag(flag):
return weight return weight
def word_segment_process(sentence): def word_flag_cut(sentence):
""" """
jieba切词\词性 jieba切词词性
:param sentence: :param sentence:
:return: :return:
""" """
sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip() sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
replace(' ', '').replace('\t', '').upper().strip()
word_list = [] word_list = []
flag_list = [] flag_list = []
try: try:
sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False)) sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
words = pseg.cut(sentence_cut) words = jieba_seg.cut(sentence_cut)
for word in words: for word in words:
word_list.append(word.word) word_list.append(word.word)
flag_list.append(word.flag) flag_list.append(word.flag)
@ -72,32 +73,52 @@ def word_segment_process(sentence):
return word_list, flag_list return word_list, flag_list
def basic_questions_encoding(w2v_model, word_list, flag_list):
    """Build a sentence vector as a part-of-speech-weighted sum of word vectors.

    :param w2v_model: word2vec model; must support ``w2v_model.wv[token]``
        lookup and expose an ``index2word`` vocabulary list.
    :param word_list: list of segmented words of the sentence.
    :param flag_list: part-of-speech flag for each word, aligned by position
        with ``word_list``; each flag is turned into a weight by
        ``get_jieba_flag``.
    :return: array with the same shape as one embedding vector; all zeros
        when ``word_list`` is empty.
    """
    # Any in-vocabulary vector works as a shape/dtype template for the zero
    # start vector. Index 1 is kept as the preferred choice; index 0 is the
    # fallback so a one-word vocabulary does not raise IndexError.
    vocab = w2v_model.index2word
    template_word = vocab[1] if len(vocab) > 1 else vocab[0]
    sentence_vec = w2v_model.wv[template_word] * 0

    for k, word in enumerate(word_list):
        flag = flag_list[k]
        try:
            sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
        except (KeyError, TypeError):
            # Unknown word (or an entry that cannot be looked up at all):
            # add 1 to every component instead of the weighted word vector.
            sentence_vec = sentence_vec + 1
    return sentence_vec


# create_matrix_org_np in this file calls the encoder under this name;
# keep it bound so that call does not fail with NameError.
encoding_basic_question = basic_questions_encoding
def basic_questions_matrix_init(matrix_org, top_vec=20):
    """Normalize the standard-question matrix once so later queries only need a dot product.

    Each row (one sentence vector) is scaled to unit L2 length.

    :param matrix_org: array-like of sentence vectors, one per row. A single
        1-D vector is accepted and treated as a one-row matrix.
    :param top_vec: requested number of best matches.
    :return: tuple ``(matrix_org_norm, matrix_org_index, top_vec)`` where
        ``matrix_org_norm`` is the row-normalized matrix cast to ``numpy_type``,
        ``matrix_org_index`` is ``[0, 1, ..., rows - 1]`` and ``top_vec`` is
        clamped so it never exceeds the number of rows.
    """
    matrix_org = np.asarray(matrix_org)
    # np.loadtxt returns a 1-D array for a single-row file; promote it to one
    # row so the per-row reduction below still applies.
    if matrix_org.ndim == 1 and matrix_org.size:
        matrix_org = matrix_org[np.newaxis, :]

    row_count = len(matrix_org)
    # Keep top_vec within the number of available rows.
    top_vec = min(row_count, top_vec)
    # Positional label for every sentence vector.
    matrix_org_index = list(range(row_count))

    # L2 norm of each row; keepdims keeps a trailing axis for broadcasting.
    row_norms = np.sqrt((matrix_org ** 2).sum(-1, keepdims=True))
    # All-zero rows have norm 0; substitute a tiny value so the division does
    # not emit "invalid value encountered in true_divide" (the row stays 0).
    row_norms[row_norms == 0] = 1e-9
    # Divide every row by its norm (not by its mean).
    matrix_org_norm = (matrix_org / row_norms).astype(numpy_type)
    return matrix_org_norm, matrix_org_index, top_vec
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
""" """
最相似的句子句向量与矩阵点乘 最相似的句子句向量与矩阵点乘
:param vec: :param vec:
@ -106,19 +127,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
:param topn: :param topn:
:return: :return:
""" """
# 首先对句向量矩阵标号 # 问句向量标准化, Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# matrix_org单位化 # 矩阵点乘, 即问句与标准问句库里边的问句点乘,
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# 计算两个向量之间的相似度使用numpy的dot函数矩阵点乘
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# 防止top_vec越界
top_vec = min(len(matrix_org), top_vec)
# 相似度排序 # 相似度排序
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
# 获取最相似标准问句的index和得分score
index_score = [] index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]: for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@ -146,8 +161,8 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_wo
for qa_dail_one in qa_dail: for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0]) ques = getChinese(qa_dail_one.split('\t')[0])
# questions.append(ques) # questions.append(ques)
word_list, flag_list = word_segment_process(ques) word_list, flag_list = word_flag_cut(ques)
sentence_vec = encoding_question(word2vec_model, word_list, flag_list) sentence_vec = encoding_basic_question(word2vec_model, word_list, flag_list)
matrix_ques.append(sentence_vec) matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count)) print("count: " + str(count))
@ -181,17 +196,20 @@ if __name__ == '__main__':
if not os.path.exists(matrix_ques_part_path): if not os.path.exists(matrix_ques_part_path):
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path) create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)
# 读取 # 读取标准问句矩阵
print("np.loadtxt(matrix_ques_part_path) start!") print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path) matrix_ques = np.loadtxt(matrix_ques_part_path)
print("np.loadtxt(matrix_ques_part_path) end!") print("np.loadtxt(matrix_ques_part_path) end!")
# 标准问句矩阵初始化和预处理
matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)
while True: while True:
print("你: ") print("你: ")
ques_ask = input() ques_ask = input()
ques_clean = getChinese(ques_ask) ques_clean = getChinese(ques_ask)
word_list, flag_list = word_segment_process(ques_clean) word_list, flag_list = word_flag_cut(ques_clean)
sentence_vic = encoding_question(word2vec_model, word_list, flag_list) sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
try: try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])