Fix numpy RuntimeWarning
RuntimeWarning: invalid value encountered in true_divide, raised by:
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
This commit is contained in:
parent 62c7af0922 · commit 977e93701f
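The warning comes from rows of the question matrix whose L2 norm is zero (for example, an empty question or one made only of out-of-vocabulary characters): dividing such a row by its zero norm yields NaN and emits the RuntimeWarning. A minimal sketch of the problem and of the guard this commit applies, with a toy matrix invented for illustration:

import numpy as np

# Toy sentence-vector matrix; the second row is all zeros, as happens for
# questions whose characters are all out of vocabulary.
matrix_org = np.array([[3.0, 4.0], [0.0, 0.0]])

# Before the fix: a zero row gives a zero norm, and 0/0 emits the
# RuntimeWarning and produces NaN.
matrix_org_sqrt = np.sqrt((matrix_org ** 2).sum(-1))[:, np.newaxis]

# The fix: replace zero norms with a tiny epsilon before dividing,
# so zero rows stay zero instead of becoming NaN.
matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(np.float32)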
@@ -4,19 +4,18 @@
 # @author :Mo
 # @function :chatbot based search, encode sentence_vec by char

-from conf.path_config import w2v_model_char_path
-from conf.path_config import matrix_ques_part_path_char
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import projectdir, chicken_and_gossip_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
+
+import gensim
+import jieba
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type
+
+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path_char
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_char_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -24,31 +23,51 @@ def load_word2vec_model(path, bin=False, limit=None):
     return word2vec_model


-def encoding_question(w2v_model, char_list):
+def question_encoding(w2v_model, char_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(char_list)):
         char_list_one = char_list[k]
         if type(char_list_one) == str:
+            word = char_list[k]
             try:
-                sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
+                sentence_vec = sentence_vec + w2v_model.wv[word]
             except Exception as e:
-                print(str(e))
                 if char_list_one not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
+                    sentence_vec = sentence_vec + 1  # add 1 for unknown tokens
     return sentence_vec


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and reduce computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, index the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+    # unit-normalize matrix_org
+    # square every sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each squared vector down to one number, i.e. add up the elements of every row, collapsing the matrix to a single column
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root of each sum; np.newaxis adds a new axis
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # avoid the divide-by-zero RuntimeWarning
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Most similar sentences: dot the question vector with the matrix
     :param vec:
@@ -57,19 +76,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, index the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector; scale a vector to unit length. The only exception is the zero vector, which is returned unchanged.
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity between the two vectors with numpy's dot function (matrix dot product)
+    # matrix dot product: the query against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec within bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)

     # get the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -97,7 +110,7 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         char_list = [ques_char for ques_char in ques]
-        sentence_vec = encoding_question(word2vec_model, char_list)
+        sentence_vec = question_encoding(word2vec_model, char_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -106,11 +119,10 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
             matrix_ques = []
             break

-    # count += 1
-    # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
+    count += 1
+    np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)

     print('create_matrix_org_pkl ok!')
     # return matrix_ques


 if __name__ == '__main__':
@@ -123,16 +135,21 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path_char):
-        # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
-    # reload
+        create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
+    # load the standard-question matrix
     print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path_char)
-    print("np.loadtxt(matrix_ques_part_path_char) ok!")
+    print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你问: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
         char_list = [ques_char for ques_char in ques_clean]
-        sentence_vic = encoding_question(word2vec_model, char_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        sentence_vec = question_encoding(word2vec_model, char_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
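After this refactor the char-level pipeline normalizes the question matrix once at startup (basic_questions_matrix_init) and then answers each query with one matrix-vector dot product plus an argsort (calculate_text_similar). A standalone sketch of that retrieval core, with random stand-in data instead of the real encoded questions:

import numpy as np
from gensim import matutils

# Stand-in data: 1000 encoded questions of dimension 300 and one encoded query.
matrix_ques = np.random.rand(1000, 300).astype(np.float32)
vec_ques = np.random.rand(300).astype(np.float32)

# One-time preprocessing: unit-normalize every row, guarding zero norms.
norms = np.sqrt((matrix_ques ** 2).sum(-1))[:, np.newaxis]
norms[norms == 0] = 1e-9
matrix_org_norm = matrix_ques / norms

# Per query: unit-normalize the query, then a single dot product gives the
# cosine similarity against every stored question; argsort picks the best.
vec_ques_mean = matutils.unitvec(vec_ques).astype(np.float32)
scores = np.dot(matrix_org_norm, vec_ques_mean)
top_20 = matutils.argsort(scores, topn=20, reverse=True)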
@@ -5,20 +5,20 @@
 # @function :chatbot based search, encode sentence_vec by word


-from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
-from conf.path_config import projectdir, chicken_and_gossip_path
-from utils.text_tools import txtRead, txtWrite, getChinese
-from conf.path_config import matrix_ques_part_path
-from numpy import float32 as numpy_type
-from collections import Counter
-import pickle, jieba, os, re
-import jieba.posseg as pseg
-from gensim import matutils
-from math import log
-import numpy as np
+import os
+import pickle
+
+import gensim
+import jieba
+import time
+import jieba.posseg as jieba_seg
+import numpy as np
+from gensim import matutils
+from numpy import float32 as numpy_type
+
+from nlp_xiaojiang.conf.path_config import matrix_ques_part_path
+from nlp_xiaojiang.conf.path_config import projectdir, chicken_and_gossip_path
+from nlp_xiaojiang.conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
+from nlp_xiaojiang.utils.text_tools import txtRead, getChinese


 def load_word2vec_model(path, bin=False, limit=None):
@@ -51,18 +51,19 @@ def get_jieba_flag(flag):
     return weight


-def word_segment_process(sentence):
+def word_flag_cut(sentence):
     """
-    jieba word segmentation \ POS tagging
+    jieba word segmentation and POS tagging
     :param sentence:
     :return:
     """
-    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
+    sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').\
+        replace(' ', '').replace('\t', '').upper().strip()
     word_list = []
     flag_list = []
     try:
         sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
-        words = pseg.cut(sentence_cut)
+        words = jieba_seg.cut(sentence_cut)
         for word in words:
             word_list.append(word.word)
             flag_list.append(word.flag)
@@ -72,32 +73,52 @@ def word_segment_process(sentence):
     return word_list, flag_list


-def encoding_question(w2v_model, word_list, flag_list):
+def basic_questions_encoding(w2v_model, word_list, flag_list):
     ''' Generate a sentence vector
     :param wordlist: token list
     :param is_replaced: whether to replace, default True
     :param debug_mode: default False
     :return: sentence vector as an array, len=300
     '''
-    try:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
-    except:
-        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
-
+    sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
     for k in range(len(word_list)):
         word = word_list[k]
         flag = flag_list[k]
         if type(word) == str:
             try:
                 sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
             except Exception as e:
                 if word not in [' ', '']:
-                    sentence_vec = sentence_vec + 1
-
+                    sentence_vec = sentence_vec + 1  # add 1 for unknown words
     return sentence_vec


-def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+def basic_questions_matrix_init(matrix_org, top_vec=20):
+    """
+    Unit-normalize and initialize the basic-question matrix, to simplify the dot product and reduce computation
+    :param matrix_org:
+    :param top_vec:
+    :return:
+    """
+    len_matrix_org = len(matrix_org)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # first, index the sentence-vector matrix
+    matrix_org_index = list(range(len_matrix_org))
+    # unit-normalize matrix_org
+    # square every sentence vector
+    matrix_org_xinxin = matrix_org ** 2
+    # sum each squared vector down to one number, i.e. add up the elements of every row, collapsing the matrix to a single column
+    matrix_org_sum = matrix_org_xinxin.sum(-1)
+    # take the square root of each sum; np.newaxis adds a new axis
+    matrix_org_sqrt = np.sqrt(matrix_org_sum)[:, np.newaxis]  # + 1e-9
+    # avoid the divide-by-zero RuntimeWarning
+    matrix_org_sqrt[matrix_org_sqrt == 0] = 1e-9
+    # divide each sentence vector by its norm
+    matrix_org_norm = (matrix_org / matrix_org_sqrt).astype(numpy_type)
+    return matrix_org_norm, matrix_org_index, top_vec
+
+
+def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec):
     """
     Most similar sentences: dot the question vector with the matrix
     :param vec:
@@ -106,19 +127,13 @@ def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
     :param topn:
     :return:
     """
-    # first, index the sentence-vector matrix
-    matrix_org_index = list(range(len(matrix_org)))
-    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    # normalize the question vector; scale a vector to unit length. The only exception is the zero vector, which is returned unchanged.
     vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
-    # unit-normalize matrix_org
-    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
-    # compute the similarity between the two vectors with numpy's dot function (matrix dot product)
+    # matrix dot product: the query against every question in the standard-question bank
     matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
-    # keep top_vec within bounds
-    top_vec = min(len(matrix_org), top_vec)
     # sort by similarity
     most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)

     # get the index and score of the most similar standard questions
     index_score = []
     for t in most_similar_sentence_vec_sort[:top_vec]:
         index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
@@ -146,8 +161,8 @@ def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
     for qa_dail_one in qa_dail:
         ques = getChinese(qa_dail_one.split('\t')[0])
         # questions.append(ques)
-        word_list, flag_list = word_segment_process(ques)
-        sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
+        word_list, flag_list = word_flag_cut(ques)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
         matrix_ques.append(sentence_vec)
         if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
             print("count: " + str(count))
@@ -181,17 +196,20 @@ if __name__ == '__main__':
     if not os.path.exists(matrix_ques_part_path):
         create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)

-    # load
+    # load the standard-question matrix
     print("np.loadtxt(matrix_ques_part_path) start!")
     matrix_ques = np.loadtxt(matrix_ques_part_path)
     print("np.loadtxt(matrix_ques_part_path) end!")
+    # initialize and preprocess the standard-question matrix
+    matrix_org_norm, matrix_org_index, top_vec = basic_questions_matrix_init(matrix_ques, top_vec=20)

     while True:
         print("你: ")
         ques_ask = input()
         ques_clean = getChinese(ques_ask)
-        word_list, flag_list = word_segment_process(ques_clean)
-        sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
-        top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+        word_list, flag_list = word_flag_cut(ques_clean)
+        sentence_vec = basic_questions_encoding(word2vec_model, word_list, flag_list)
+        top_20_qid = calculate_text_similar(sentence_vec, matrix_org_norm, matrix_org_index, top_vec=top_vec)
         try:
             print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
             print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
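The word-level variant differs from the char-level one mainly in encoding: basic_questions_encoding weights each word vector by its part-of-speech tag via get_jieba_flag. A hedged sketch of that encoding step, assuming a loaded gensim w2v_model; the weight table and function name here are stand-ins for illustration, not the repo's real get_jieba_flag:

import jieba.posseg as jieba_seg
import numpy as np

# Hypothetical POS weights; the real mapping lives in get_jieba_flag.
FLAG_WEIGHT = {'n': 1.3, 'v': 1.2}

def encode_by_word(w2v_model, sentence):
    # Sum POS-weighted word vectors, as basic_questions_encoding does above.
    sentence_vec = np.zeros(w2v_model.vector_size, dtype=np.float32)
    for pair in jieba_seg.cut(sentence):
        try:
            sentence_vec += w2v_model.wv[pair.word] * FLAG_WEIGHT.get(pair.flag, 1.0)
        except KeyError:
            sentence_vec += 1  # unknown word: add 1, mirroring the diff
    return sentence_vec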