Fix numpy warning

RuntimeWarning: invalid value encountered in true_divide, raised by:
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
commit 977e93701f
parent 62c7af0922
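The root cause: any all-zero row of matrix_org has an L2 norm of 0, so the division fills that row with NaN and numpy emits the warning. A minimal, self-contained sketch of the failure mode and of the epsilon fix this commit applies (toy 2×2 matrix; names mirror the diff):

```python
import numpy as np

# One "sentence vector" is all zeros, so its L2 norm is 0. Dividing by the
# raw norms would emit "RuntimeWarning: invalid value encountered in
# true_divide" and fill that row with NaN.
matrix_org = np.array([[3.0, 4.0],
                       [0.0, 0.0]])

norms = np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]
norms[norms == 0] = 1e-9  # the commit's fix: never divide by zero
matrix_org_norm = (matrix_org / norms).astype(np.float32)
print(matrix_org_norm)  # the zero row stays zero instead of turning into NaN
```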
@@ -4,19 +4,18 @@
 # @author   :Mo
 # @function :chatbot based search, encode sentence_vec by char

-from conf.path_config import w2v_model_char_path
-from conf.path_config import matrix_ques_part_path_char
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import projectdir, chicken_and_gossip_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
 import gensim
-import jieba
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type
+
+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path_char
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_char_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -24,31 +23,51 @@ def load_word2vec_model(path, bin=False, limit=None):
     return word2vec_model


-def encoding_question(w2v_model, char_list):
+def question_encoding(w2v_model, char_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: the sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(char_list)):
-        char_list_one = char_list[k]
-        if type(char_list_one) == str:
-            try:
-                sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
-            except Exception as e:
-                print(str(e))
-                if char_list_one not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
+        word = char_list[k]
+        try:
+            sentence_vec = sentence_vec + w2v_model.wv[word]
+        except Exception as e:
+            sentence_vec = sentence_vec + 1  # add 1 for unknown chars
     return sentence_vec
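question_encoding builds a sentence vector as a plain sum of character vectors, adding a constant 1 whenever a character is out of vocabulary. A runnable toy sketch of the same idea, with a two-entry dict standing in for the real word2vec model:

```python
import numpy as np

# Toy character vectors standing in for w2v_model.wv; 2-dim instead of 300.
char_vecs = {"你": np.array([0.1, 0.2]), "好": np.array([0.3, 0.4])}

def encode(char_list):
    sentence_vec = np.zeros(2, dtype=np.float32)
    for ch in char_list:
        try:
            sentence_vec = sentence_vec + char_vecs[ch]
        except KeyError:
            sentence_vec = sentence_vec + 1  # unknown char: add 1, as in the diff
    return sentence_vec

print(encode(list("你好吗")))  # "吗" is out of vocabulary here
```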


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and cut computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec from going out of bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, number the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+
+    # unit-normalize matrix_org
+    # square each sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each sentence vector to a single number; axis=1 collapses the columns, i.e. adds up each row
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root; np.newaxis adds a dimension
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # fix the true_divide warning: replace zero norms with a tiny epsilon
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its L2 norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Find the most similar sentences: dot the sentence vector with the matrix
     :param vec:
@@ -57,19 +76,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, number the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector: scale it to unit length (the only exception is the zero vector, which is returned unchanged)
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity with numpy's dot function (matrix dot product)
+    # matrix dot product: the question vector against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec from going out of bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
+    # collect the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
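Together, basic_questions_matrix_init and calculate_text_similar amount to cosine-similarity retrieval: the question matrix is row-normalized once up front, each query is unit-scaled, and a single matrix-vector dot product plus argsort returns the top matches. A self-contained sketch (toy 3×2 matrix; unitvec is applied to the query directly, which is equivalent to the mean-of-one-row form used in the diff):

```python
import numpy as np
from gensim import matutils

numpy_type = np.float32

# Toy "standard question" matrix: 3 questions, 2-dim vectors, one zero row.
matrix_ques = np.array([[1.0, 0.0], [0.0, 0.0], [0.6, 0.8]])

# basic_questions_matrix_init: normalize rows once, keep an index, clamp top_vec.
norms = np.sqrt((matrix_ques ** 2).sum(-1))[:, np.newaxis]
norms[norms == 0] = 1e-9
matrix_org_norm = (matrix_ques / norms).astype(numpy_type)
matrix_org_index = list(range(len(matrix_ques)))
top_vec = min(len(matrix_ques), 2)

# calculate_text_similar: unit-length query, dot product, sort by similarity.
vec_ques_mean = matutils.unitvec(np.array([0.5, 0.9])).astype(numpy_type)
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
top = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
print([[matrix_org_index[t], float(matrix_vec_dot[t])] for t in top])
```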
@@ -97,7 +110,7 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         char_list = [ques_char for ques_char in ques]
-        sentence_vec = encoding_question(word2vec_model, char_list)
+        sentence_vec = question_encoding(word2vec_model, char_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -106,11 +119,10 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
             matrix_ques = []
             break

-    # count += 1
-    # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
+    count += 1
+    np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)

     print('create_matrix_org_pkl ok!')
-    # return matrix_ques


 if __name__ == '__main__':
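create_matrix_org_np flushes each batch of sen_count sentence vectors to disk with np.savetxt, and __main__ later reloads them with np.loadtxt. A minimal round-trip under an illustrative file name (the real path is built from projectdir):

```python
import numpy as np

matrix_ques = np.random.rand(5, 3).astype(np.float32)  # 5 toy sentence vectors
np.savetxt("sentence_vec_demo.txt", matrix_ques)       # one row per question
reloaded = np.loadtxt("sentence_vec_demo.txt")         # comes back as float64
assert np.allclose(matrix_ques, reloaded, atol=1e-6)
```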
@@ -123,16 +135,21 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path_char):
         # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
         create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
-    # reload
+    # load the standard-question matrix
+    print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path_char)
-    print("np.loadtxt(matrix_ques_part_path_char) ok!")
+    print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你问: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
         char_list = [ques_char for ques_char in ques_clean]
-        sentence_vic = encoding_question(word2vec_model, char_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        sentence_vec = question_encoding(word2vec_model, char_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
@@ -5,20 +5,20 @@
 # @function :chatbot based search, encode sentence_vec by word


-from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
-from conf.path_config import projectdir, chicken_and_gossip_path
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import matrix_ques_part_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
 import gensim
 import jieba
-import time
+import jieba.posseg as jieba_seg
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type

+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -51,18 +51,19 @@ def get_jieba_flag(flag):
     return weight


-def word_segment_process(sentence):
+def word_flag_cut(sentence):
     """
-    jieba word segmentation \ POS tagging
+    jieba word segmentation and POS tagging
     :param sentence:
     :return:
     """
-    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
+    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
+                        replace(' ', '').replace('\t', '').upper().strip()
     word_list = []
     flag_list = []
     try:
         sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
-        words = pseg.cut(sentence_cut)
+        words = jieba_seg.cut(sentence_cut)
         for word in words:
             word_list.append(word.word)
             flag_list.append(word.flag)
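word_flag_cut leans on jieba's POS tokenizer, imported here as jieba_seg; each pair it yields carries .word and .flag. A short usage sketch (the exact segmentation depends on jieba's dictionary, so the printed output is indicative):

```python
import jieba.posseg as jieba_seg

word_list, flag_list = [], []
for pair in jieba_seg.cut("今天天气很好"):
    word_list.append(pair.word)  # the token itself
    flag_list.append(pair.flag)  # its part-of-speech tag
print(word_list, flag_list)      # e.g. ['今天', '天气', '很', '好'] ['t', 'n', 'd', 'a']
```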
@@ -72,32 +73,52 @@ def word_segment_process(sentence):
     return word_list, flag_list


-def encoding_question(w2v_model, word_list, flag_list):
+def basic_questions_encoding(w2v_model, word_list, flag_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: the sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(word_list)):
         word = word_list[k]
         flag = flag_list[k]
-        if type(word) == str:
-            try:
-                sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
-            except Exception as e:
-                if word not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
+        try:
+            sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
+        except Exception as e:
+            sentence_vec = sentence_vec + 1  # add 1 for unknown words
     return sentence_vec
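On the word side, basic_questions_encoding scales each word vector by get_jieba_flag(flag) before summing. The actual weight table is defined outside this diff, so the weights below are placeholders, not the repository's values; the sketch only shows the weighted-sum shape:

```python
import numpy as np

def get_jieba_flag(flag):
    # placeholder weights: the repository's real table is not shown in this diff
    return 1.3 if flag in ("n", "v") else 1.0

word_vecs = {"天气": np.array([0.2, 0.1]), "好": np.array([0.0, 0.3])}

def encode(word_list, flag_list):
    sentence_vec = np.zeros(2, dtype=np.float32)
    for word, flag in zip(word_list, flag_list):
        try:
            sentence_vec = sentence_vec + word_vecs[word] * get_jieba_flag(flag)
        except KeyError:
            sentence_vec = sentence_vec + 1  # unknown word: add 1, as in the diff
    return sentence_vec

print(encode(["天气", "好", "吗"], ["n", "a", "y"]))
```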


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and cut computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec from going out of bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, number the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+
+    # unit-normalize matrix_org
+    # square each sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each sentence vector to a single number; axis=1 collapses the columns, i.e. adds up each row
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root; np.newaxis adds a dimension
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # fix the true_divide warning: replace zero norms with a tiny epsilon
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its L2 norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Find the most similar sentences: dot the sentence vector with the matrix
     :param vec:
@@ -106,19 +127,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, number the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector: scale it to unit length (the only exception is the zero vector, which is returned unchanged)
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity with numpy's dot function (matrix dot product)
+    # matrix dot product: the question vector against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec from going out of bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
+    # collect the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -146,8 +161,8 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         # questions.append(ques)
-        word_list, flag_list = word_segment_process(ques)
-        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
+        word_list, flag_list = word_flag_cut(ques)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -181,17 +196,20 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path):
         create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)

-    # load
+    # load the standard-question matrix
     print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path)
     print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
-        word_list, flag_list = word_segment_process(ques_clean)
-        sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        word_list, flag_list = word_flag_cut(ques_clean)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])