diff --git a/ChatBot/__init__.py b/ChatBot/__init__.py new file mode 100644 index 0000000..d869156 --- /dev/null +++ b/ChatBot/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/29 23:11 +# @author :Mo +# @function : \ No newline at end of file diff --git a/ChatBot/chatbot_search/__init__.py b/ChatBot/chatbot_search/__init__.py new file mode 100644 index 0000000..d838479 --- /dev/null +++ b/ChatBot/chatbot_search/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 15:15 +# @author :Mo +# @function : \ No newline at end of file diff --git a/ChatBot/chatbot_search/chatbot_fuzzy.py b/ChatBot/chatbot_search/chatbot_fuzzy.py new file mode 100644 index 0000000..605690c --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_fuzzy.py @@ -0,0 +1,163 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + + +from conf.path_config import chicken_and_gossip_path +from utils.text_tools import txtRead, txtWrite +from conf.path_config import projectdir +from fuzzywuzzy import process +from fuzzywuzzy import fuzz +import pickle +import time +import re + + +def count_same_char(x1, x2): + '''获取相同字符的个数''' + res = [] + for x in x1: + if x in x2: + res.append(x) + if res: + return len(res) + else: + return 0 + + +def fuzzy_re(user_input, collection): + '''匹配方法, 效果不大好,只能匹配相同字数一样,或者字数比他多的那种,同义词或者是有一个词不一样,就没法区分开''' + suggestions = [] + user_input = user_input.replace('.', '').replace('*', '').replace('?', '') + + collection_new = [] + len_user_input = len(user_input) + for coll in collection: # 获取包含所有字符的,如果不包含,就返回错误 + count_coll = 0 + for i in range(len_user_input): + if user_input[i] in coll: + count_coll += 1 + if len_user_input == count_coll: + collection_new.append(coll) + if not collection_new: + return None + + + pattern = '.*?'.join(user_input) # Converts 'djm' to 'd.*?j.*?m' + try: + regex = re.compile(pattern) # Compiles a regex. + except: + gg = 0 + for item in collection_new: + match = regex.search(item) # Checks if the current item matches the regex. 
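        # (editor's note) If re.compile above failed, the bare except only sets gg = 0 and
        # leaves `regex` undefined, so this loop would raise NameError; returning an empty
        # list from that except branch would be the safer fallback. Matches collected below
        # are sorted by (matched span length, start position), so the tightest and earliest
        # matches in a candidate question rank first.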
+ if match: + suggestions.append((len(match.group()), match.start(), item)) + return [x for _, _, x in sorted(suggestions)] + + +def fuzzy_fuzzywuzzy(fuzz, user_input, collection): + '''编辑距离,速度比较慢,比起匹配方法,能够处理字符不一样的问题''' + collection_new = [] + len_user_input = len(user_input) + for coll in collection: # 获取包含一个字符的,如果不包含,就返回错误 + for i in range(len_user_input): + if user_input[i] in coll: + collection_new.append(coll) + if not collection_new: + return None + collection_new = list(set(collection_new)) + + same_char_list = [] + for collection_new_one in collection_new: # 获取相同字符串多的问题 + count_same_char_one = count_same_char(user_input, collection_new_one) + same_char_list.append((collection_new_one, count_same_char_one)) + same_char_list.sort(key=lambda x: x[1], reverse=True) + if len(same_char_list) >= 500: + same_char_list = same_char_list[0: 500] + + result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20) + return result + + +def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50): + '''编辑距离,速度比较慢,比起匹配方法,能够处理字符不一样的问题''' + + start_time = time.time() + # user_input_set = set([user_input_one for user_input_one in user_input]) + user_input_set = [user_input_one for user_input_one in user_input] + + + same_char_list = [] + max_data = 0 + max_data_list = [] + count_collection_new_one = 0 + for collection_new_one in collection: # 获取相同字符串多的问题 + count_same_char_one = len([x for x in user_input_set if x in collection_new_one]) + + if count_same_char_one > 0: + same_char_list.append((count_collection_new_one, count_same_char_one)) + if count_same_char_one > max_data: + max_data_list.append(count_same_char_one) + max_data = count_same_char_one + count_collection_new_one += 1 + + end_time1 = time.time() + list_max_count = [] + len_max_data_list = len(max_data_list) + for x in range(len_max_data_list): # 获取前20排名 + for k,l in same_char_list: + if l == max_data_list[len_max_data_list -1 - x]: + list_max_count.append(qa_list[k]) #问答重这里取出来 + if len(list_max_count) >= 5000: + list_max_count = list_max_count[0:5000] + break + + end_time2 = time.time() + + # end_time1: 0.34090662002563477 + # end_time2: 0.4080846309661865 + + # end_time1: 0.06417036056518555 + # end_time2: 0.08422374725341797 + + # same_char_list.sort(key=lambda x: x[1], reverse=True) + # if len(same_char_list) >= 20: + # same_char_list = same_char_list[0: 20] + + result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn) + end_time3 = time.time() + + # print('end_time1: ' + str(end_time1 - start_time)) + # print('end_time2: ' + str(end_time2 - start_time)) + # print('end_time3: ' + str(end_time3 - start_time)) + + return result + # [fuzz.WRatio, fuzz.QRatio, + # fuzz.token_set_ratio, fuzz.token_sort_ratio, + # fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio, + # fuzz.UWRatio, fuzz.UQRatio] + + +if __name__ == '__main__': + start_time = time.time() + qa_list = txtRead(chicken_and_gossip_path) + questions = [qa.strip().split("\t")[0] for qa in qa_list] + print("read questions ok!") + sen = "你谁呀" + # list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list) + # list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list) + print("你问: " + "你谁呀") + list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5) + print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip()) + print("推荐结果: ") + print(list_fuzzyfinder) + + while True: + print("你问: ") + ques = input() + list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, 
ques, qa_list, questions, topn=5) + print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip()) + print("推荐结果: ") + print(list_fuzzyfinder) diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py new file mode 100644 index 0000000..e229451 --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py @@ -0,0 +1,142 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function :chatbot based search, encode sentence_vec by char + +from conf.path_config import w2v_model_char_path +from conf.path_config import matrix_ques_part_path_char +from utils.text_tools import txtRead, txtWrite, getChinese +from conf.path_config import projectdir, chicken_and_gossip_path +from numpy import float32 as numpy_type +from collections import Counter +import pickle, jieba, os, re +import jieba.posseg as pseg +from gensim import matutils +from math import log +import numpy as np +import gensim +import jieba + + +def load_word2vec_model(path, bin=False, limit=None): + word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore') + return word2vec_model + + +def encoding_question(w2v_model, char_list): + ''' 生成句子向量 + :param wordlist: 分词list + :param is_replaced: 是否替换default true + :param debug_mode: default false + :return: array句子的向量 len=300 + ''' + try: + sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0 + except: + sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0 + + for k in range(len(char_list)): + char_list_one = char_list[k] + if type(char_list_one) == str: + try: + sentence_vec = sentence_vec + w2v_model.wv[char_list_one] + except Exception as e: + print(str(e)) + if char_list_one not in [' ', '']: + sentence_vec = sentence_vec + 1 + return sentence_vec + + +def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20): + """ + 最相似的句子,句向量与矩阵点乘 + :param vec: + :param matrix: + :param keys: + :param topn: + :return: + """ + # 首先对句向量矩阵标号 + matrix_org_index = list(range(len(matrix_org))) + # Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged. 
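    # (editor's note) Unit-normalising the query vector here and every row of matrix_org
    # below turns the plain dot product into cosine similarity, so the argsort further
    # down returns the stored questions closest in direction to the query sentence vector.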
+ vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) + # matrix_org单位化 + matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type) + # 计算两个向量之间的相似度,使用numpy的dot函数,矩阵点乘 + matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) + # 防止top_vec越界 + top_vec = min(len(matrix_org), top_vec) + # 相似度排序 + most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) + + index_score = [] + for t in most_similar_sentence_vec_sort[:top_vec]: + index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) + return index_score + + +def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path): + """ + 创建问题句向量 + :param sen_count: int + :param word2vec_model: gensim model + :param qa_path: str + :param matrix_ques_path:str + :return: None + """ + if os.path.exists(matrix_ques_path): + file_matrix_ques = open(matrix_ques_path, 'rb') + matrix_ques = pickle.load(file_matrix_ques) + return matrix_ques + print('create_matrix_org_pkl start!') + qa_dail = txtRead(qa_path, encodeType='utf-8') + # questions = [] + matrix_ques = [] + count = 0 + for qa_dail_one in qa_dail: + ques = getChinese(qa_dail_one.split('\t')[0]) + char_list = [ques_char for ques_char in ques] + sentence_vec = encoding_question(word2vec_model, char_list) + matrix_ques.append(sentence_vec) + if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: + print("count: " + str(count)) + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques) + matrix_ques = [] + break + + # count += 1 + # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques) + + print('create_matrix_org_pkl ok!') + # return matrix_ques + + +if __name__ == '__main__': + + # 读取问答语料 + syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8') + # 读取词向量 + word2vec_model = load_word2vec_model(w2v_model_char_path, limit=None) + # 创建标准问答中问题的句向量,存起来,到matrix_ques_path, 10万条,可自己设置,这里需要耗费点时间 + if not os.path.exists(matrix_ques_part_path_char): + # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) + create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) + # 重载 + matrix_ques = np.loadtxt(matrix_ques_part_path_char) + print("np.loadtxt(matrix_ques_part_path_char) ok!") + while True: + print("你问: ") + ques_ask = input() + ques_clean = getChinese(ques_ask) + char_list = [ques_char for ques_char in ques_clean] + sentence_vic = encoding_question(word2vec_model, char_list) + top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) + try: + print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) + print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) + except Exception as e: + # 有的字符可能打不出来 + print(str(e)) + diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py new file mode 100644 index 0000000..0e3d61c --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py @@ -0,0 +1,217 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function :chatbot based search, encode sentence_vec by word + + +from conf.path_config 
import w2v_model_merge_short_path, w2v_model_wiki_word_path +from conf.path_config import projectdir, chicken_and_gossip_path +from utils.text_tools import txtRead, txtWrite, getChinese +from conf.path_config import matrix_ques_part_path +from numpy import float32 as numpy_type +from collections import Counter +import pickle, jieba, os, re +import jieba.posseg as pseg +from gensim import matutils +from math import log +import numpy as np +import gensim +import jieba +import time + + +def load_word2vec_model(path, bin=False, limit=None): + print("load_word2vec_model start!") + word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore') + print("load_word2vec_model end!") + return word2vec_model + + +def is_oov(model_vec, query_seg, p_max=0.16): + """ + 判断查询分词的oov情况是放弃,如果oov词个数超过xx%则放弃该回答答案 + :param topic_model: + :return: + """ + words = [word for word in query_seg if str(word).strip() is not ""] + count_total = 1 + count_oov = 0 + if words: + count_total = len(words) + for word in words: + if word not in model_vec: + count_oov = count_oov + 1 + return float(count_oov/count_total) > p_max + + +def get_td_idf_flag(jieba_cut_list, dictionary, tfidf_model): + # todo + '''获取td-idf权重,有问题,同一个词只计算一次,有的还没有,比如说停用词''' + seg1_list = [] + vec1 = tfidf_model[dictionary.doc2bow(jieba_cut_list)] + for vec1_one in vec1: + seg1_list.append(vec1_one[1]) + sum_seg1_list = sum(seg1_list) + + return [x/sum_seg1_list for x in seg1_list] + + +def get_jieba_flag(flag): + '''词性''' + if flag in ['n', 'nr', 'ns', 'nt', 'nz']: + weight = 1.3 + elif flag in ['r', 'i', 't', 'ng', 'an']: + weight = 0.7 + else: + weight = 1 + return weight + + +def word_segment_process(sentence): + """ + jieba切词\词性 + :param sentence: + :return: + """ + sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip() + word_list = [] + flag_list = [] + try: + sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False)) + words = pseg.cut(sentence_cut) + for word in words: + word_list.append(word.word) + flag_list.append(word.flag) + except Exception as e: + word_list = [sentence] + flag_list = ['nt'] + return word_list, flag_list + + +def encoding_question(w2v_model, word_list, flag_list): + ''' 生成句子向量 + :param wordlist: 分词list + :param is_replaced: 是否替换default true + :param debug_mode: default false + :return: array句子的向量 len=300 + ''' + try: + sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0 + except: + sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0 + + for k in range(len(word_list)): + word = word_list[k] + flag = flag_list[k] + if type(word) == str: + try: + sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag) + except Exception as e: + if word not in [' ', '']: + sentence_vec = sentence_vec + 1 + + return sentence_vec + + +def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20): + """ + 最相似的句子,句向量与矩阵点乘 + :param vec: + :param matrix: + :param keys: + :param topn: + :return: + """ + # 首先对句向量矩阵标号 + matrix_org_index = list(range(len(matrix_org))) + # Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged. 
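    # (editor's note) Questions whose sentence vector ends up all zeros (for example when
    # getChinese strips every character) make the row normalisation below divide by zero
    # and produce NaN; this is the "invalid value encountered in true_divide" RuntimeWarning
    # visible in the result_test logs. Adding a small epsilon to the denominator would avoid it.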
+ vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) + # matrix_org单位化 + matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type) + # 计算两个向量之间的相似度,使用numpy的dot函数,矩阵点乘 + matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) + # 防止top_vec越界 + top_vec = min(len(matrix_org), top_vec) + # 相似度排序 + most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) + + index_score = [] + for t in most_similar_sentence_vec_sort[:top_vec]: + index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) + return index_score + + +def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word): + """ + 创建问题句向量,设置sen_count=10000, 防止内存不够奔溃 + :param sen_count: int, write sentence_encode num per twice + :param word2vec_model: model + :param qa_path: str + :param matrix_ques_path: str + :return: + """ + if os.path.exists(matrix_ques_path_word): + file_matrix_ques = open(matrix_ques_path_word, 'rb') + matrix_ques = pickle.load(file_matrix_ques) + return matrix_ques + print('create_matrix_org_pkl start!') + qa_dail = txtRead(qa_path, encodeType='utf-8') + # questions = [] + matrix_ques = [] + count = 0 + for qa_dail_one in qa_dail: + ques = getChinese(qa_dail_one.split('\t')[0]) + # questions.append(ques) + word_list, flag_list = word_segment_process(ques) + sentence_vec = encoding_question(word2vec_model, word_list, flag_list) + matrix_ques.append(sentence_vec) + if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: + print("count: " + str(count)) + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques) + matrix_ques = [] + # break + + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques) + # matrix_ques = [] + # file_matrix_ques = open(matrix_ques_path, 'wb') + # pickle.dump(matrix_ques, file_matrix_ques) + print('create_matrix_org_np ok!') + # return matrix_ques + + +if __name__ == '__main__': + # 读取问答语料 + syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8') + + # 读取词向量,w2v_model_wiki_word_path数据是自己训练的,w2v_model_merge_short_path只取了部分数据,你可以前往下载 + if os.path.exists(w2v_model_wiki_word_path): + word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None) + print("load w2v_model_wiki_word_path ok!") + else: + word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None) + print("load w2v_model_merge_short_path ok!") + + # 创建标准问答中问题的句向量,存起来,到matrix_ques_path + if not os.path.exists(matrix_ques_part_path): + create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path) + + # 读取 + print("np.loadtxt(matrix_ques_part_path) start!") + matrix_ques = np.loadtxt(matrix_ques_part_path) + print("np.loadtxt(matrix_ques_part_path) end!") + while True: + print("你: ") + ques_ask = input() + ques_clean = getChinese(ques_ask) + word_list, flag_list = word_segment_process(ques_clean) + sentence_vic = encoding_question(word2vec_model, word_list, flag_list) + top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) + try: + print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) + print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) + except Exception as e: + # 有的字符可能打不出来 + print(str(e)) diff --git a/FeatureProject/__init__.py 
b/FeatureProject/__init__.py new file mode 100644 index 0000000..98d55da --- /dev/null +++ b/FeatureProject/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/29 23:10 +# @author :Mo +# @function : \ No newline at end of file diff --git a/FeatureProject/__pycache__/__init__.cpython-36.pyc b/FeatureProject/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1a3197e Binary files /dev/null and b/FeatureProject/__pycache__/__init__.cpython-36.pyc differ diff --git a/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc new file mode 100644 index 0000000..782f92e Binary files /dev/null and b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc differ diff --git a/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc new file mode 100644 index 0000000..e6a5b80 Binary files /dev/null and b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc differ diff --git a/FeatureProject/cut_td_idf.py b/FeatureProject/cut_td_idf.py new file mode 100644 index 0000000..ac2c4b0 --- /dev/null +++ b/FeatureProject/cut_td_idf.py @@ -0,0 +1,104 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/1 10:35 +# @author :Mo +# @function :cut sentences + + +from conf.path_config import chicken_and_gossip_path, td_idf_cut_path, td_idf_cut_pinyin +from utils.text_tools import txtWrite, txtRead, get_syboml, strQ2B +from conf.path_config import projectdir +from gensim import corpora, models +import xpinyin +import pickle +import jieba + + +def cut_td_idf(sources_path, target_path): + """ + 结巴切词,汉语 + :param path: + :return: + """ + print("cut_td_idf start! ") + corpus = txtRead(sources_path) + governments = [] + for corpus_one in corpus: + corpus_one_clear = corpus_one.replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + governments.append(ques_q2b_syboml.strip()) + + government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments)) + + topic_ques_all = [] + for topic_ques_one in government_ques: + top_ques_aqlq = topic_ques_one.replace(' ', ' ').replace(' ', ' ').strip() + '\n' + topic_ques_all.append(top_ques_aqlq) + + txtWrite(topic_ques_all, target_path) + print("cut_td_idf ok! " + sources_path) + + +def cut_td_idf_pinyin(sources_path, target_path): #获取拼音 + """ + 汉语转拼音 + :param path: + :return: + """ + pin = xpinyin.Pinyin() + corpus = txtRead(sources_path) + topic_ques_all = [] + corpus_count = 0 + for corpus_one in corpus: + corpus_count += 1 + # time1 = time.time() + corpus_one_clear = corpus_one.replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').replace(' ', '').strip(), ' ') + topic_ques_all.append(ques_q2b_syboml_pinying + '\n') + # time2 = time.time() + # print(str(corpus_count) + 'time:' + str(time2 - time1)) + txtWrite(topic_ques_all, target_path) + print("cut_td_idf_pinyin ok! 
" + sources_path) + + +def init_tfidf_chinese_or_pinyin(sources_path): + """ + 构建td_idf + :param path: + :return: + """ + questions = txtRead(sources_path) + corpora_documents = [] + for item_text in questions: + item_seg = list(jieba.cut(str(item_text).strip())) + corpora_documents.append(item_seg) + + dictionary = corpora.Dictionary(corpora_documents) + corpus = [dictionary.doc2bow(text) for text in corpora_documents] + tfidf_model = models.TfidfModel(corpus) + print("init_tfidf_chinese_or_pinyin ok! " + sources_path) + file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') + pickle.dump([dictionary, tfidf_model], file) + + +if __name__ == '__main__': + # path_text = projectdir + '/Data/chicken_gossip.txt' + # sentences = txtRead(path_text) + # sentences_q = [] + # for sentences_one in sentences: + # sentences_one_replace = sentences_one.replace(" ", "").replace("\t", "") + # sentences_one_replace_split = sentences_one_replace.split("|") + # sentence_new = sentences_one_replace_split[0] + "\t" + "".join(sentences_one_replace_split[1:]) + # sentences_q.append(sentence_new) + # sentences = txtWrite(sentences_q, projectdir + '/Data/chicken_and_gossip.txt') + + + cut_td_idf(chicken_and_gossip_path, td_idf_cut_path) + cut_td_idf_pinyin(chicken_and_gossip_path, td_idf_cut_pinyin) + init_tfidf_chinese_or_pinyin(td_idf_cut_path) + init_tfidf_chinese_or_pinyin(td_idf_cut_pinyin) + print("corpus ok!") + diff --git a/FeatureProject/distance_text_or_vec.py b/FeatureProject/distance_text_or_vec.py new file mode 100644 index 0000000..b501b99 --- /dev/null +++ b/FeatureProject/distance_text_or_vec.py @@ -0,0 +1,330 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + +from sklearn.feature_extraction.text import TfidfVectorizer +from utils.text_tools import txtRead, get_syboml, strQ2B +import Levenshtein as Leven +from fuzzywuzzy import fuzz +import jieba.analyse +import numpy as np +import xpinyin +import pickle +import jieba +import os + + +zero_bit = 0.000000001 +pin = xpinyin.Pinyin() + + +def clear_sentence(sentence): + """ + 数据清晰,全角转半角 + :param sentence: str, input sentence + :return: str, clearned sentences + """ + corpus_one_clear = str(sentence).replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + return ques_q2b_syboml + + +def chinese2pinyin(sentence): + """ + chinese translate to pingyin + :param sentence: str, input sentence + :return: str, output pingyin + """ + ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ') + return ques_q2b_syboml_pinying + + +def hamming_distance(v1, v2): + n = int(v1, 2) ^ int(v2, 2) + return bin(n & 0xffffffff).count('1') + + +def cosine_distance(v1, v2): # 余弦距离 + if v1.all() and v2.all(): + return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) + else: + return 0 + + +def euclidean_distance(v1, v2): # 欧氏距离 + return np.sqrt(np.sum(np.square(v1 - v2))) + + +def manhattan_distance(v1, v2): # 曼哈顿距离 + return np.sum(np.abs(v1 - v2)) + + +def chebyshev_distance(v1, v2): # 切比雪夫距离 + return np.max(np.abs(v1 - v2)) + + +def minkowski_distance(v1, v2): # 闵可夫斯基距离 + return np.sqrt(np.sum(np.square(v1 - v2))) + + +def euclidean_distance_standardized(v1, v2): # 标准化欧氏距离 + v1_v2 = np.vstack([v1, v2]) + sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1) + return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum()) + + +def mahalanobis_distance(v1, v2): # 马氏距离 + # 马氏距离要求样本数要大于维数,否则无法求协方差矩阵 + # 此处进行转置,表示10个样本,每个样本2维 + X = 
np.vstack([v1, v2]) + XT = X.T + + # 方法一:根据公式求解 + S = np.cov(X) # 两个维度之间协方差矩阵 + try: + SI = np.linalg.inv(S) # 协方差矩阵的逆矩阵 todo + except: + SI = np.zeros_like(S) + # 马氏距离计算两个样本之间的距离,此处共有10个样本,两两组合,共有45个距离。 + n = XT.shape[0] + distance_all = [] + for i in range(0, n): + for j in range(i + 1, n): + delta = XT[i] - XT[j] + distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T)) + distance_all.append(distance_1) + return np.sum(np.abs(distance_all)) + + +def bray_curtis_distance(v1, v2): # 布雷柯蒂斯距离, 生物学生态距离 + up_v1_v2 = np.sum(np.abs(v2 - v1)) + down_v1_v2 = np.sum(v1) + np.sum(v2) + return up_v1_v2 / (down_v1_v2 + zero_bit) + + +def pearson_correlation_distance(v1, v2): # 皮尔逊相关系数(Pearson correlation) + v1_v2 = np.vstack([v1, v2]) + return np.corrcoef(v1_v2)[0][1] + + +def jaccard_similarity_coefficient_distance(v1, v2): # 杰卡德相似系数(Jaccard similarity coefficient) + # 方法一:根据公式求解 + v1 = np.asarray(v1) + v2 = np.asarray(v2) + up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum()) + down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit) + return up / down + + +def wmd_distance(model, sent1_cut_list, sent2_cut_list): # WMD距离 + # model.init_sims(replace=True) + distance = model.wmdistance(sent1_cut_list, sent2_cut_list) + return distance + + +# def HamMings_Levenshtein(str1, str2): +# sim = Leven.hamming(str1, str2) +# return sim + +def edit_levenshtein(str1, str2): + return Leven.distance(str1, str2) + + +def ratio_levenshtein(str1, str2): + return Leven.ratio(str1, str2) + + +def jaro_levenshtein(str1, str2): + return Leven.jaro(str1, str2) + + +def set_ratio_fuzzywuzzy(str1, str2): + return fuzz.token_set_ratio(str1, str2) + + +def sort_ratio_fuzzywuzzy(str1, str2): + return fuzz.token_sort_ratio(str1, str2) + + +def num_of_common_sub_str(str1, str2): + ''' + 求两个字符串的最长公共子串 + 思想:建立一个二维数组,保存连续位相同与否的状态 + ''' + lstr1 = len(str1) + lstr2 = len(str2) + record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)] # 多一位 + maxNum = 0 # 最长匹配长度 + p = 0 # 匹配的起始位 + + for i in range(lstr1): + for j in range(lstr2): + if str1[i] == str2[j]: + # 相同则累加 + record[i + 1][j + 1] = record[i][j] + 1 + if record[i + 1][j + 1] > maxNum: + # 获取最大匹配长度 + maxNum = record[i + 1][j + 1] + # 记录最大匹配长度的终止位置 + p = i + 1 + # return str1[p - maxNum:p], maxNum + return maxNum + + +####################################################### 汉明距离 +def string_hash(source): + if source == "": + return 0 + else: + x = ord(source[0]) << 7 + m = 1000003 + mask = 2 ** 128 - 1 + for c in source: + x = ((x * m) ^ ord(c)) & mask + x ^= len(source) + if x == -1: + x = -2 + x = bin(x).replace('0b', '').zfill(64)[-64:] + + return str(x) + + +def sim_hash(content): + seg = jieba.cut(content) + keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=()) + # 先按照权重排序,再按照词排序 + keyList = [] + # print(keyWord) + for feature, weight in keyWord: + weight = int(weight * 20) + feature = string_hash(feature) + temp = [] + for f in feature: + if f == '1': + temp.append(weight) + else: + temp.append(-weight) + keyList.append(temp) + content_list = np.sum(np.array(keyList), axis=0) + # 编码读不出来 + if len(keyList) == 0: + return '00' + simhash = '' + for c in content_list: + if c > 0: + simhash = simhash + '1' + else: + simhash = simhash + '0' + return simhash + + +def hamming_distance_equal(v1, v2): + n = int(v1, 2) ^ int(v2, 2) + return bin(n & 0xffffffff).count('1') + + +def hamming_distance(sen1, sen2): + return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2)) + + +def normalization(x): + 
""" + 归一化,最大最小值 + :param x: + :return: + """ + return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x] + + +def z_score(x, axis=0): + """ + 标准化 + :param x: arrary, numpy + :param axis: int, 0 + :return: arrary, numpy + """ + x = np.array(x).astype(float) + xr = np.rollaxis(x, axis=axis) + xr -= np.mean(x, axis=axis) + xr /= np.std(x, axis=axis) + # print(x) + return x + + +def tok_td_idf(data_path): + if os.path.exists(data_path + 'td_idf_cut.csv'): + '''#计算TD-DIDF,获取训练测试数据''' + datas = txtRead(data_path + 'td_idf_cut.csv') + # 默认值只匹配长度≥2的单词,修改为1;ngram_range特征所以有2个词的,总计词语50428个 + # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000) + vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3, + max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000) + vec_tdidf.fit_transform(datas) + file_vec_tdidf = open(data_path + 'td_idf_cut_model.pkl', 'wb') + pickle.dump(vec_tdidf, file_vec_tdidf) + + return vec_tdidf + + +def tok_td_idf_pinyin(data_path): + if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'): + '''#计算TD-DIDF,获取训练测试数据''' + datas = txtRead(data_path + 'td_idf_cut_pinyin.csv') + # 默认值只匹配长度≥2的单词,修改为1;ngram_range特征所以有2个词的,总计词语50428个 + # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000) + vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3, + max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000) + vec_tdidf.fit_transform(datas) + file_vec_tdidf = open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb') + pickle.dump(vec_tdidf, file_vec_tdidf) + + return vec_tdidf + + +if __name__ == '__main__': + vec1_test = np.array([1, 38, 17, 32]) + vec2_test = np.array([5, 6, 8, 9]) + + str1_test = "你到底是谁?" 
+ str2_test = "没想到我是谁,是真样子" + + print(clear_sentence(str1_test)) # 数据处理 + print(chinese2pinyin(str1_test)) # 中文转拼音 + + print(euclidean_distance(vec1_test, vec2_test)) + print(cosine_distance(vec1_test, vec2_test)) + print(manhattan_distance(vec1_test, vec2_test)) + print(euclidean_distance(vec1_test, vec2_test)) + print(chebyshev_distance(vec1_test, vec2_test)) + print(minkowski_distance(vec1_test, vec2_test)) + + print(euclidean_distance_standardized(vec1_test, vec2_test)) + print(mahalanobis_distance(vec1_test, vec2_test)) + + print('###############################################') + + print(bray_curtis_distance(vec1_test, vec2_test)) + print(pearson_correlation_distance(vec1_test, vec2_test)) + print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test)) + + print('###############################################') + + # print(HamMings_Levenshtein(str1, str2)),需要等长 + # print(Wmd_distance(model, sent1_cut_list, sent2_cut_list)) # 需要gensim word2vec model + + print(hamming_distance(str1_test, str2_test)) + print(edit_levenshtein(str1_test, str2_test)) + print(ratio_levenshtein(str1_test, str2_test)) + print(jaro_levenshtein(str1_test, str2_test)) + print(set_ratio_fuzzywuzzy(str1_test, str2_test)) + print(sort_ratio_fuzzywuzzy(str1_test, str2_test)) + print(num_of_common_sub_str(str1_test, str2_test)) + print(normalization(vec1_test)) # 归一化(0-1) + print(z_score(vec1_test)) # 标准化(0附近,正负) + + # data_path = 'D:/workspace/python/bitbucket/nlp_model_v1.0/nlp_model/models/word_feature/sim_data/' + # tok_TD_IDF(data_path) + # tok_TD_IDF_pinyin(data_path) diff --git a/FeatureProject/distance_vec_TS_SS.py b/FeatureProject/distance_vec_TS_SS.py new file mode 100644 index 0000000..447ee79 --- /dev/null +++ b/FeatureProject/distance_vec_TS_SS.py @@ -0,0 +1,84 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 10:36 +# @author :Mo +# @function :TS-SS distance +# @url :https://github.com/taki0112/Vector_Similarity +# @paper :A Hybrid Geometric Approach for Measuring Similarity Level Among Documents and Document Clustering + + +import numpy as np +import math + +zero_bit = 0.000000001 + + +def Cosine(vec1, vec2): + """ + 余弦相似度 + :param vec1: arrary + :param vec2: arrary + :return: float + """ + result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2) + zero_bit) + return result + + +def VectorSize(vec): + vec_pow = sum(math.pow(v + zero_bit, 2) for v in vec) + if vec_pow >= 0: + return math.sqrt(vec_pow) + else: + return zero_bit + + +def InnerProduct(vec1, vec2): + try: + return sum(v1 * v2 for v1, v2 in zip(vec1, vec2)) + except: + return zero_bit + + +def Euclidean(vec1, vec2): + vec12_pow = sum(math.pow((v1 - v2), 2) for v1, v2 in zip(vec1, vec2)) + if vec12_pow >= 0: + return math.sqrt(vec12_pow) + else: + return zero_bit + + +def Theta(vec1, vec2): + cosine_vec12 = Cosine(vec1, vec2) + if -1 <= cosine_vec12 and cosine_vec12 <= 1: + return math.acos(cosine_vec12) + 10 + else: + return zero_bit + 10 + + +def Triangle(vec1, vec2): + theta = math.radians(Theta(vec1, vec2)) + return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2 + + +def Magnitude_Difference(vec1, vec2): + return abs(VectorSize(vec1) - VectorSize(vec2)) + + +def Sector(vec1, vec2): + ED = Euclidean(vec1, vec2) + MD = Magnitude_Difference(vec1, vec2) + theta = Theta(vec1, vec2) + return math.pi * math.pow((ED + MD), 2) * theta / 360 + + +def TS_SS(vec1, vec2): + return Triangle(vec1, vec2) * Sector(vec1, vec2) + + +if __name__ == '__main__': + vec1_test = np.array([1, 38, 17, 32]) + 
vec2_test = np.array([5, 6, 8, 9]) + + print(Euclidean(vec1_test, vec2_test)) + print(Cosine(vec1_test, vec2_test)) + print(TS_SS(vec1_test, vec2_test)) diff --git a/FeatureProject/normalization_util.py b/FeatureProject/normalization_util.py new file mode 100644 index 0000000..4edf2b1 --- /dev/null +++ b/FeatureProject/normalization_util.py @@ -0,0 +1,96 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/12 14:18 +# @author :Mo +# @site :https://blog.csdn.net/rensihui + +from sklearn import preprocessing +import numpy as np + +def autoL1L2(data, norms = 'l1'): + '''L1或者L2正则化''' + return preprocessing.normalize(data, norm = norms) + +def autoScale(data): + '''标准化, (X-mean)/std.得到的结果是,对于每个属性/每列来说所有数据都聚集在0附近,方差为1。''' + return preprocessing.scale(data) + +def autoMinMaxScaler(data): + '''将属性缩放到一个指定范围''' + return preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(data) + +def autoLinNorm(data): # 传入一个矩阵 + ''' 0-1归一化 + :param data: []矩阵 + :return: [] + ''' + mins = data.min(0) # 返回data矩阵中每一列中最小的元素,返回一个列表 + maxs = data.max(0) # 返回data矩阵中每一列中最大的元素,返回一个列表 + ranges = maxs - mins # 最大值列表 - 最小值列表 = 差值列表 + normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 + row = data.shape[0] # 返回 data矩阵的行数 + normData = data - np.tile(mins, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 + normData = normData / np.tile(ranges, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) + return normData + + + +def autoAvgNorm(data): # 传入一个矩阵 + ''' 均值归一化 + :param data: []矩阵 + :return: [] + ''' + avg = np.average(data, axis=1) # 返回data矩阵中每一列中最小的元素,返回一个列表 + sigma = np.std(data, axis=1) # 返回data矩阵中每一列中最大的元素,返回一个列表 + normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 + row = data.shape[0] # 返回 data矩阵的行数 + normData = data - np.tile(avg, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 + normData = normData / np.tile(sigma, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) + return normData + + + +###Sigmoid函数;Sigmoid函数是一个具有S形曲线的函数,是良好的阈值函数,在(0, 0.5)处中心对称,在(0, 0.5)附近有比较大的斜率, +# 而当数据趋向于正无穷和负无穷的时候,映射出来的值就会无限趋向于1和0,是个人非常喜欢的“归一化方法”,之所以打引号是因为我觉得Sigmoid函数在 +# 阈值分割上也有很不错的表现,根据公式的改变,就可以改变分割阈值,这里作为归一化方法,我们只考虑(0, 0.5)作为分割阈值的点的情况: +def sigmoid(data,useStatus): + ''' sig归一化 + :param data: []矩阵 + :return: [] + ''' + if useStatus: + row=data.shape[0] + column=data.shape[1] + normData = np.zeros(np.shape(data)) + for i in range(row): + for j in range(column): + normData[i][j]=1.0 / (1 + np.exp(-float(data[i][j]))); + return normData + else: + return float(data); + +if __name__ == '__main__': + arr = np.array([[8, 7, 8], [4, 3, 1], [6, 9, 8]]) + + print("l1正则化") + print(autoL1L2(arr, norms='l1')) + + print("l2正则化") + print(autoL1L2(arr, norms='l2')) + + print("0-1标准化处理") + print(autoScale(arr)) + + print("0-1缩放处理") + print(autoMinMaxScaler(arr)) + + + print("0-1归一化处理") + print(autoLinNorm(arr)) + + + print("均值归一化处理") + print(autoAvgNorm(arr)) + + print("sig归一化处理") + print(sigmoid(arr,True)) diff --git a/FeatureProject/sentence_sim_feature.py b/FeatureProject/sentence_sim_feature.py new file mode 100644 index 0000000..ffe056c --- /dev/null +++ b/FeatureProject/sentence_sim_feature.py @@ -0,0 +1,384 @@ +# -*- coding:utf-8 -*- +# -*- created by: moyongzhuo -*- + + +from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, euclidean_distance, jaccard_similarity_coefficient_distance +from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized +from FeatureProject.distance_text_or_vec import 
mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance +from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score +from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy +from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str +from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin +from FeatureProject.distance_vec_TS_SS import TS_SS +from gensim import corpora, models, matutils +from conf.path_config import projectdir +from gensim.models import KeyedVectors +import pandas as pd +import numpy as np +import pickle +import jieba +import time +import os + + +class SentenceSimFeature: + def __init__(self): + self.sen1 = None + self.sen2 = None + self.seg1 = None + self.seg2 = None + self.sen_vec1 = None + self.sen_vec2 = None + self.tfidf_vec1 = None + self.tfidf_vec2 = None + self.dictionary = None + self.tfidf_model = None + self.w2c_model = None + + self.tfidf_pinyin_model = None + self.dictionary_pinyin = None + self.sen1_pinyin = None + self.sen2_pinyin = None + self.seg1_pinyin = None + self.seg2_pinyin = None + self.tfidf_vec1_pinyin = None + self.tfidf_vec2_pinyin = None + + def set_data(self, sen1, sen2): + sen1 = clear_sentence(sen1) + sen2 = clear_sentence(sen2) + self.sen1 = str(sen1).strip() + self.sen2 = str(sen2).strip() + self.seg1 = list(jieba.cut(sen1)) + self.seg2 = list(jieba.cut(sen2)) + self.sen1_pinyin = chinese2pinyin(sen1) + self.sen2_pinyin = chinese2pinyin(sen2) + self.seg1_pinyin = (self.sen1_pinyin).split(' ') + self.seg2_pinyin = (self.sen2_pinyin).split(' ') + self.sen_vec1 = np.zeros(300) + self.sen_vec2 = np.zeros(300) + # self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0]) + # self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0]) + # self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0]) + # self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0]) + self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)] + self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)] + self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)] + self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)] + + def same_word_count(self): + count_left = 0 + for s in self.seg1: + if s in self.seg2: + count_left += 1 + + count_right = 0 + for s in self.seg2: + if s in self.seg1: + count_right += 1 + + return min(count_left, count_right) + + def same_char_count(self): + seg1 = list(self.sen1) + seg2 = list(self.sen2) + + count_left = 0 + for s in seg1: + if s in seg2: + count_left += 1 + + count_right = 0 + for s in seg2: + if s in seg1: + count_right += 1 + + return min(count_left, count_right) + + def sentence_length(self): + len_sen1 = len(self.sen1) + len_sen2 = len(self.sen2) + len_abs_sub = abs(len_sen1 - len_sen2) + len_rate = len_sen1 / len_sen2 + len_add_rate = len_sen1 * len_sen2 / (len_sen1 + len_sen2) + + return [len_abs_sub, len_rate, len_add_rate] + + def init_sentence_vector(self): + # file_path = os.path.dirname(__file__) + print('load w2v model begin') + # model_path = os.path.join(file_path, word2_vec_path) + 
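        # (editor's note) Loading the full wiki word2vec file takes a long time and a lot of
        # RAM; gensim's `limit` argument (already exposed here, e.g. limit=200000) loads only
        # the first N vectors, which is usually enough for quick experiments with these features.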
self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None) # ,binary=True) + print('load w2v model success') + + def encode_sentence_vector(self): + for s in self.seg1: + try: + self.sen_vec1 += self.w2c_model[s] + except: + self.sen_vec1 += np.zeros(300) + continue + + for s in self.seg2: + try: + self.sen_vec2 += self.w2c_model[s] + except: + self.sen_vec2 += np.zeros(300) + continue + + def init_tfidf(self): + file = open(td_idf_path, 'rb') + tfidf_dictionary_model = pickle.load(file) + self.dictionary = tfidf_dictionary_model[0] + self.tfidf_model = tfidf_dictionary_model[1] + + file = open(td_idf_path_pinyin, 'rb') + tfidf_dictionary_pinyin_model = pickle.load(file) + self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0] + self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1] + print("init_tfidf ok!") + + def w2c_all_vec(self): + w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2) + w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2) + w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2) + w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2) + w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2) + + w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2) + w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2) + + w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2) + w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2) + w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2) + w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2) + + # w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2) + return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, + w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson] + + def tdidf_all_vec(self): + + return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2) + + def edit_all_str(self): + str_hamming = hamming_distance(self.sen1, self.sen2) + str_edit = edit_levenshtein(self.sen1, self.sen2) + str_ratio = ratio_levenshtein(self.sen1, self.sen2) + str_jaro = jaro_levenshtein(self.sen1, self.sen2) + str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2) + str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2) + str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2) + str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2) + + return [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] + + def word_jaccard(self): + a = list(set(self.seg1).intersection(set(self.seg2))) + b = list(set(self.seg1).union(set(self.seg2))) + return float(len(a) / len(b)) + + def char_jaccard(self): + a = list(set(list(self.sen1)).intersection(set(list(self.sen2)))) + b = list(set(list(self.sen1)).union(set(list(self.sen2)))) + + return float(len(a) / len(b)) + + def tdidf_all_vec_pinyin(self): + + return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin) + + def edit_all_pinyin(self): + pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin) + pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin) + pinyin_sort_ratio_fuzz = 
sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin) + pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin) + pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin) + + return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] + + def word_jaccard_pinyin(self): + a = list(set(self.seg1_pinyin).intersection(set(self.seg2_pinyin))) + b = list(set(self.seg1_pinyin).union(set(self.seg2_pinyin))) + return float(len(a) / len(b)) + + def char_jaccard_pinyin(self): + a = list(set(list(self.seg1_pinyin)).intersection(set(list(self.seg2_pinyin)))) + b = list(set(list(self.seg1_pinyin)).union(set(list(self.seg2_pinyin)))) + + return float(len(a) / len(b)) + + +def sentence_input_t(): + while True: + s1 = input('s1: ') + s2 = input('s2: ') + + start_time = time.time() + ssf.set_data(s1, s2) + ssf.encode_sentence_vector() + + time1 = time.time() + print('set_data time:' + str(time1 - start_time)) + + # 相同词、长度 + same_word_count = ssf.same_word_count() + time2 = time.time() + print('same_word_count time:' + str(time2 - time1)) + + same_char_count = ssf.same_char_count() + time3 = time.time() + print('same_char_count time:' + str(time3 - time2)) + + [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length() + time4 = time.time() + print('sentence_length time:' + str(time4 - time3)) + + # w2c_all_vec + [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, + w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, + w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec() + time5 = time.time() + print('w2c_all_vec time:' + str(time5 - time4)) + + # tdidf_all_vec + # [tdidf_Cosine, tdidf_TS_SS, tdidf_Manhattan, tdidf_Euclidean, + # tdidf_Jaccard, tdidf_Chebyshev,tdidf_Minkowski, tdidf_Euclidean_Standard, tdidf_Mahalanobis, + # tdidf_Bray, tdidf_Pearson] = ssf.tdidf_all_vec() + tdidf_cossim = ssf.tdidf_all_vec() + time6 = time.time() + print('tdidf_all_vec time:' + str(time6 - time5)) + + # edit_all_str + [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str() + time7 = time.time() + print('edit_all_str time:' + str(time7 - time6)) + + # jaccard系数 + word_jaccard = ssf.word_jaccard() + char_jaccard = ssf.char_jaccard() + time8 = time.time() + print('jaccard系数 time:' + str(time8 - time7)) + + # tdidf_all_vec_pinyin + # [tdidf_piyin_Cosine, tdidf_piyin_TS_SS, tdidf_piyin_Manhattan, tdidf_piyin_Euclidean, tdidf_piyin_Jaccard, + # tdidf_piyin_Chebyshev, tdidf_piyin_Minkowski, tdidf_piyin_Euclidean_Standard, tdidf_piyin_Mahalanobis, + # tdidf_piyin_Bray, tdidf_piyin_Pearson] = ssf.tdidf_all_vec_pinyin() + tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin() + time9 = time.time() + print('tdidf_all_vec_pinyin time:' + str(time9 - time8)) + + # edit_all_pinyin + [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin() + time10 = time.time() + print('edit_all_pinyin time:' + str(time10 - time9)) + + # jaccard系数 + word_jaccard_pinyin = ssf.word_jaccard_pinyin() + char_jaccard_pinyin = ssf.char_jaccard_pinyin() + time11 = time.time() + print('jaccard系数pinyin time:' + str(time11 - time10)) + sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate, + w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, 
w2c_Minkowski, + w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson, + tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, + str_sort_ratio_fuzz, + str_commonsubstr, str_list_Wmd, + word_jaccard, char_jaccard, tdidf_pinyin_cossim, + pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz, + pinyin_sort_ratio_fuzz, + pinyin_commonsubstr, pinyin_list_Wmd, + word_jaccard_pinyin, char_jaccard_pinyin] + print("sim: ") + print(sim_all_last) + + +if __name__ == '__main__': + ssf = SentenceSimFeature() + ssf.init_sentence_vector() + ssf.init_tfidf() + s1 = "你知道Mo的能力上限吗" + s2 = "你好呀,Mo水平很差" + start_time = time.time() + + ssf.set_data(s1, s2) + ssf.encode_sentence_vector() + + time1 = time.time() + print('set_data time:' + str(time1 - start_time)) + + # 相同词、长度 + same_word_count = ssf.same_word_count() + time2 = time.time() + print('same_word_count time:' + str(time2 - time1)) + + same_char_count = ssf.same_char_count() + time3 = time.time() + print('same_char_count time:' + str(time3 - time2)) + + [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length() + time4 = time.time() + print('sentence_length time:' + str(time4 - time3)) + + # w2c_all_vec + [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, + w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, + w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec() + time5 = time.time() + print('w2c_all_vec time:' + str(time5 - time4)) + + # tdidf_all_vec + tdidf_cossim = ssf.tdidf_all_vec() + time6 = time.time() + print('tdidf_all_vec time:' + str(time6 - time5)) + + # edit_all_str + [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str() + time7 = time.time() + print('edit_all_str time:' + str(time7 - time6)) + + # jaccard系数 + word_jaccard = ssf.word_jaccard() + char_jaccard = ssf.char_jaccard() + time8 = time.time() + print('jaccard系数 time:' + str(time8 - time7)) + + # pinyin + tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin() + time9 = time.time() + print('tdidf_all_vec_pinyin time:' + str(time9 - time8)) + + # edit_all_pinyin + [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin() + time10 = time.time() + print('edit_all_pinyin time:' + str(time10 - time9)) + + # jaccard系数 + word_jaccard_pinyin = ssf.word_jaccard_pinyin() + char_jaccard_pinyin = ssf.char_jaccard_pinyin() + time11 = time.time() + print('jaccard系数pinyin time:' + str(time11 - time10)) + + sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate, + w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, + w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson, + tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz, + str_commonsubstr, str_list_Wmd, + word_jaccard, char_jaccard, tdidf_pinyin_cossim, + pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz, + pinyin_sort_ratio_fuzz, + pinyin_commonsubstr, pinyin_list_Wmd, + word_jaccard_pinyin, char_jaccard_pinyin] + print("小姜机器人计算sim: ") + print(sim_all_last) + + sentence_input_t() diff --git a/conf/__init__.py b/conf/__init__.py new file mode 100644 index 0000000..b238954 --- /dev/null +++ b/conf/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# 
@function : \ No newline at end of file diff --git a/conf/__pycache__/__init__.cpython-36.pyc b/conf/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..9042934 Binary files /dev/null and b/conf/__pycache__/__init__.cpython-36.pyc differ diff --git a/conf/__pycache__/path_config.cpython-36.pyc b/conf/__pycache__/path_config.cpython-36.pyc new file mode 100644 index 0000000..6ee0fdc Binary files /dev/null and b/conf/__pycache__/path_config.cpython-36.pyc differ diff --git a/conf/path_config.py b/conf/path_config.py new file mode 100644 index 0000000..72bf2aa --- /dev/null +++ b/conf/path_config.py @@ -0,0 +1,39 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# @function :path + + +import pathlib +import sys +import os + + +# base dir +projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent) +sys.path.append(projectdir) +print(projectdir) + +# corpus +chicken_and_gossip_path = projectdir + '/Data/corpus/chicken_and_gossip.txt' + +# word2vec +w2v_model_merge_short_path = projectdir + "/Data/chinese_vector/w2v_model_merge_short.vec" + +# tf_idf +td_idf_cut_path = projectdir + '/Data/tf_idf/td_idf_cut.csv' +td_idf_cut_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin.csv' +td_idf_path_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin_dictionary_model.pkl' +td_idf_path = projectdir + '/Data/tf_idf/td_idf_cut_dictionary_model.pkl' + +# word, 句向量 +w2v_model_wiki_word_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_word.vec' +matrix_ques_part_path = projectdir + '/Data/sentence_vec_encode_word/1.txt' + +# char, 句向量 +w2v_model_char_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_char.vec' +matrix_ques_part_path_char = projectdir + '/Data/sentence_vec_encode_char/1.txt' + +# word2vec select +word2_vec_path = w2v_model_wiki_word_path if os.path.exists(w2v_model_wiki_word_path) else w2v_model_merge_short_path \ No newline at end of file diff --git a/python-version-time b/python-version-time new file mode 100644 index 0000000..54dcdff --- /dev/null +++ b/python-version-time @@ -0,0 +1,15 @@ +Python 3.3.2(May 15, 2013) +Python 3.2.5(May 15, 2013) +Python 3.1.5(April 10, 2012) +Python 3.0.1(February 13, 2009) +Python 2.7.5(May 15, 2013) +Python 2.6.8(April 10, 2012) +Python 2.5.6(May 26, 2011) +Python 2.4.6(December 19, 2008) +Python 2.3.7(March 11, 2008) +Python 2.2.3(May 30, 2003) +Python 2.1.3(April 8, 2002) +Python 2.0.1(June 2001) +Python 1.6.1(September 2000) +Python 1.5.2(April 1999) +Older releases:Source releases,binaries-1.1,binaries-1.2,binaries-1.3,binaries-1.4,binaries-1.5 diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..b6c21a2 --- /dev/null +++ b/readme.md @@ -0,0 +1,49 @@ +# nlp_xiaojiang + +# Data + - chinese_vector + - 截取的部分word2vec训练词向量(自己需要下载全效果才会好) + - corpus + - 小黄鸡和gossip问答预料(数据没清洗) + - sentence_vec_encode_char + - 1.txt(字向量生成的前100000句向量) + - sentence_vec_encode_word + - 1.txt(词向量生成的前100000句向量) + - tf_idf(chicken_and_gossip.txt生成的tf-idf) + +# ChatBot + - 检索式ChatBot + - 像ES那样直接检索(如使用fuzzywuzzy),只能字面匹配 + - 构造句向量,检索问答库,能够检索有同义词的句子 + - 生成式ChatBot(todo) + - seq2seq + - GAN + +# FeatureProject + - normalization_util指的是数据归一化 + - 0-1归一化处理 + - 均值归一化 + - sig归一化处理 + - sim feature(这里只有ML,没有bert、emlo等的句向量相似度) + - distance_text_or_vec:各种计算文本、向量距离等 + - distance_vec_TS_SS:TS_SS计算词向量距离 + - cut_td_idf:将小黄鸡语料和gossip结合 + - sentence_sim_feature:计算两个文本的相似度或者距离,例如qq(问题和问题),或者qa(问题和答案) + +# run + - 1.创建tf-idf文件等(运行2需要先跑1): python cut_td_idf.py + - 
diff --git a/requestments.txt b/requestments.txt
new file mode 100644
index 0000000..2195e9c
--- /dev/null
+++ b/requestments.txt
@@ -0,0 +1,12 @@
+python-Levenshtein==0.12.0
+fuzzywuzzy==0.17.0
+openpyxl==2.6.2
+pandas==0.24.2
+xpinyin==0.5.6
+numpy==1.16.1
+gensim==3.7.1
+pyemd==0.5.1
+jieba==0.39
+xlrd==1.2.0
+sklearn
+pathlib
diff --git a/result_test/__init__.py b/result_test/__init__.py
new file mode 100644
index 0000000..cdaeb55
--- /dev/null
+++ b/result_test/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 14:40
+# @author :Mo
+# @function : 
\ No newline at end of file
diff --git a/result_test/result_chatbot_fuzzy.txt b/result_test/result_chatbot_fuzzy.txt
new file mode 100644
index 0000000..cd5b755
--- /dev/null
+++ b/result_test/result_chatbot_fuzzy.txt
@@ -0,0 +1,38 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+read questions ok!
+[... remainder of the log: several interactive question/answer rounds with the top-5 fuzzy matches; the Chinese console output in this capture was written with the wrong encoding (GBK mojibake) and is not recoverable ...]
diff --git a/result_test/result_chatbot_sentence_vec_by_char.txt b/result_test/result_chatbot_sentence_vec_by_char.txt
new file mode 100644
index 0000000..f750d52
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_char.txt
@@ -0,0 +1,55 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+np.loadtxt(matrix_ques_part_path_char) ok!
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py:115: RuntimeWarning: invalid value encountered in true_divide
+  matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+'gbk' codec can't encode character '\u301c' in position 227: illegal multibyte sequence
+"word ' ' not in vocabulary"
+list index out of range
+'gbk' codec can't encode character '\u2207' in position 329: illegal multibyte sequence
+[... remainder of the log: question/answer rounds with ranked candidate lists, interleaved with more 'gbk' codec encode errors; the Chinese console output in this capture is GBK mojibake and not recoverable ...]
diff --git a/result_test/result_chatbot_sentence_vec_by_word.txt b/result_test/result_chatbot_sentence_vec_by_word.txt
new file mode 100644
index 0000000..2c1f251
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_word.txt
@@ -0,0 +1,73 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load_word2vec_model start!
+load_word2vec_model end!
+load w2v_model_wiki_word_path ok!
+np.loadtxt(matrix_ques_part_path) start!
+np.loadtxt(matrix_ques_part_path) end!
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.815 seconds.
+Prefix dict has been built succesfully.
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py:131: RuntimeWarning: invalid value encountered in true_divide
+  matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+[... remainder of the log: question/answer rounds with ranked candidate lists, interleaved with more 'gbk' codec encode errors; the Chinese console output in this capture is GBK mojibake and not recoverable ...]
diff --git a/result_test/result_sentence_sim_feature.txt b/result_test/result_sentence_sim_feature.txt
new file mode 100644
index 0000000..f5adfec
--- /dev/null
+++ b/result_test/result_sentence_sim_feature.txt
@@ -0,0 +1,37 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load w2v model begin
+load w2v model success
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.719 seconds.
+set_data time0.7200782299041748
+Prefix dict has been built succesfully.
+same_word_count time0.0 +same_char_count time0.0 +sentence_length time0.0 +w2c_all_vec time0.1994335651397705 +tdidf_all_vec time0.0 +edit_all_str time0.0019953250885009766 +jaccardϵ time0.0 +tdidf_all_vec_pinyin time0.0 +edit_all_pinyin time0.004553556442260742 +jaccardϵpinyin time0.0 +sim: +[1, 3, 1, 1.1, 5.238095238095238, 0.6782572237857507, 3461.1677906854284, 283.83272299933014, 19.980963040347838, 0.9999999999966667, 3.0830289870500565, 19.980963040347838, 24.494821131252575, 79619.83774188746, -5.10379204991808, 0.6769724044408956, 0.0, 12, 9, 0.2857142857142857, 0.5242424242424243, 19, 19, 2, 8.141546895617283, 0.08333333333333333, 0.16666666666666666, 0.008081558347970244, 17, 22, 0.5217391304347826, 0.6838686096962837, 56, 47, 4, 6.190419904893637, 0.11764705882352941, 0.11764705882352941] +s1: +s2: ߵúѽ +set_data time0.0009706020355224609 +same_word_count time0.0009982585906982422 +same_char_count time0.0 +sentence_length time0.0 +w2c_all_vec time0.20846796035766602 +tdidf_all_vec time0.0 +edit_all_str time0.0019943714141845703 +jaccardϵ time0.0 +tdidf_all_vec_pinyin time0.0 +edit_all_pinyin time0.0019960403442382812 +jaccardϵpinyin time0.0 +sim: +[2, 3, 1, 0.875, 3.7333333333333334, 0.8200504988005877, 3746.94646712115, 236.48076447923086, 17.65693370974129, 0.9999999999966667, 4.2634280025959015, 17.65693370974129, 24.494877087856107, 78956.49194315828, -13.367107715032754, 0.8200018973656127, 0.07174613344073014, 21, 6, 0.4, 0.6011904761904762, 40, 40, 1, 5.620521171774245, 0.2, 0.25, 0.36243089354552877, 10, 15, 0.5384615384615384, 0.6417797888386123, 62, 58, 5, 6.01776904578638, 0.25, 0.25] +s1: \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..d838479 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 15:15 +# @author :Mo +# @function : \ No newline at end of file diff --git a/utils/text_tools.py b/utils/text_tools.py new file mode 100644 index 0000000..a8b40f6 --- /dev/null +++ b/utils/text_tools.py @@ -0,0 +1,322 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# @function :utils, tools + + +from openpyxl import Workbook +import logging as logger +import gensim +import jieba +import time +import xlrd +import re + + +#中英文标点符号 +filters='[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '!,;:。?、“”’‘《》()~@#¥%……&*\()/{}【】…=-]' +#标点符号、空格 +filters_1 = "[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》()~@#¥%……&*\(\)\/\-]+" + +"""去除标点符号、空格""" +def clear_punctuation(text): + """去除标点符号""" + sentence = text.replace(' ', '') + sentence_punctuation_clear = re.sub(filters, ' ', sentence).strip() + sentence_punctuation_clear_replace = sentence_punctuation_clear.replace(' ', ' ').replace(' ', ' ') + return sentence_punctuation_clear_replace + + +'''截取中文、拼音、数字,去除特殊字符等''' +def getChinese1(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + findAllChinese = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z])", ques)) + # print(sub_str) + return findAllChinese + + +'''xlrd读xls''' +def xlsRead(sheetName=None, cols=0, fileXlsPath=None): + '''读xls文件''' + workbook = xlrd.open_workbook(fileXlsPath) + # 根据sheet索引或者名称获取sheet内容 + sheet = workbook.sheet_by_name(sheetName) + nrows = sheet.nrows + ncols = sheet.ncols + + listRows = [] + for i in range(nrows): + listRows.append(sheet.row_values(i)) + + return listRows + + +'''openpyxl写xlsx''' +def xlsxWrite(sheetName, writeList, fileXlsName): + wb = Workbook() + print('{}'.format(wb.get_sheet_names())) # 
提供一个默认名叫Sheet的表,office2016下新建提供默认Sheet1 + sheet = wb.create_sheet(sheetName) + # i = 0 + for listLine_one in writeList: + # i += 1 + sheet.append(listLine_one) + # if i == 1000: + # break + wb.save(fileXlsName) + + + +"""判断一个unicode是否是英文字母""" +def is_alphabet(uchar): + """判断一个unicode是否是英文字母""" + if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'): + return True + else: + return False + +'''读取txt文件''' +def txtRead(filePath, encodeType = 'utf-8'): + listLine = [] + try: + file = open(filePath, 'r', encoding= encodeType) + + while True: + line = file.readline() + if not line: + break + + listLine.append(line) + + file.close() + + except Exception as e: + logger.info(str(e)) + + finally: + return listLine + +'''读取txt文件''' +def txtWrite(listLine, filePath, type = 'w',encodeType='utf-8'): + + try: + file = open(filePath, type, encoding=encodeType) + file.writelines(listLine) + file.close() + + except Exception as e: + logger.info(str(e)) + +'''截取中文、拼音、数字,去除特殊字符等''' +'''要保留特殊字符的格式,最好的方法是每个字符都去匹配''' + +def getChinese(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + ques = strQ2B(ques) + answer = '' + for ques_one in ques: + ques_one_findall = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z峣㒶㒰玘宸諕鄕缓緩𪥵嬆嬲煙草砼赟贇龘㗊㵘㙓敠])", ques_one)) + if not ques_one_findall: + ques_one_findall = ' ' + answer = answer + ques_one_findall + answer = answer.strip().replace(' ', ' ').replace(' ', ' ') + return answer.upper() + +'''去除标点符号''' + +def get_syboml(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + ques = strQ2B(ques) + # answer = re.sub(u'([。.,,、\;;::??!!“”"‘’'''()()…——-《》<>{}_~【】\\[])', ' ', ques).replace(' ', ' ').replace(' ', ' ') + answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\](|){}【】~@#¥%…&*\/\-—_]+", " ", ques).strip() + return answer + +'''xlrd读xls''' + +def xlsRead(sheetName=None, cols=0, fileXlsPath=None): + '''读xls文件''' + workbook = xlrd.open_workbook(fileXlsPath) + # 根据sheet索引或者名称获取sheet内容 + sheet = workbook.sheet_by_name(sheetName) + nrows = sheet.nrows + ncols = sheet.ncols + + listRows = [] + for i in range(nrows): + listRows.append(sheet.row_values(i)) + + return listRows + +'''openpyxl写xlsx''' + +def xlsxWrite(sheetName, writeList, fileXlsName): + wb = Workbook() + print('{}'.format(wb.get_sheet_names())) # 提供一个默认名叫Sheet的表,office2016下新建提供默认Sheet1 + sheet = wb.create_sheet(sheetName) + # i = 0 + for listLine_one in writeList: + # i += 1 + sheet.append(listLine_one) + # if i == 1000: + # break + wb.save(fileXlsName) + +'''读取txt文件''' + +def txtRead(filePath, encodeType='utf-8'): + listLine = [] + try: + file = open(filePath, 'r', encoding=encodeType) + + while True: + line = file.readline() + if not line: + break + + listLine.append(line) + + file.close() + + except Exception as e: + logger.info(str(e)) + + finally: + return listLine + +'''读取txt文件''' + +def txtWrite(listLine, filePath, type='w', encodeType='utf-8'): + + try: + file = open(filePath, type, encoding=encodeType) + file.writelines(listLine) + file.close() + + except Exception as e: + logger.info(str(e)) + +# -*- coding: cp936 -*- +def strQ2B(ustring): + """全角转半角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif (inside_code >= 65281 and inside_code <= 65374): # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + + rstring += chr(inside_code) + return rstring + +def strB2Q(ustring): + """半角转全角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 32: # 半角空格直接转化 + inside_code = 12288 + elif 
inside_code >= 32 and inside_code <= 126: # 半角字符(除空格)根据关系转化 + inside_code += 65248 + + rstring += chr(inside_code) + return rstring + +def is_valid_date(strdate): + '''判断是否是一个有效的日期字符串''' + try: + if ":" in strdate: + time.strptime(strdate, "%Y-%m-%d %H:%M:%S") + else: + time.strptime(strdate, "%Y-%m-%d") + return True + except: + return False + +'''判断是否是全英文的''' + +def is_total_english(text): + """判断一个是否是全英文字母""" + symbol = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + numben = 0 + for one in sentence_punctuation_clear: + if one in symbol: + numben += 1 + if numben == len(sentence_punctuation_clear): + return True + else: + return False + except: + return False + +'''判断是否是数字的''' + +def is_total_number(text): + """判断一个是否是全英文字母""" + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + numben = 0 + for one in sentence_punctuation_clear: + if one.isdigit(): + numben += 1 + if numben == len(sentence_punctuation_clear): + return True + else: + return False + except: + return False + +def is_number_or_english(text): + '''不为数字不为字母''' + judge = False + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + for words in sentence_punctuation_clear: + judge_number = is_total_number(words) + judge_english = is_total_english(words) + judge = judge_number or judge_english + if not judge: + return False + return judge + except: + return False + +#todo #句子改写,同义词替换,去停用词等 + + +if __name__ == '__main__': + + + # for i in range(10): + # sentence_vec = word2vec_model.wv["的"] + # sentence_vec_pd = pd.DataFrame(sentence_vec) + # sentence_vec_pd.to_csv('my_csv.csv', mode='a', header=False) + + # sentence_ee = pd.read_csv('my_csv.csv') + + # txtWrite([str(sentence_vec)], "gg.txt") + + + # path_test_data_government = '/data/test_data_government.csv' + # sentences = txtRead(path_test_data_government) + sentences = [] + sentences_one_clear_punctuation_all = [] + for sentences_one in sentences[1:]: + sentences_one_1 = sentences_one + sentences_one_clear_punctuation = clear_punctuation(sentences_one_1.replace(',0.0,1.0', '')) + # print(sentences_one) + # print(sentences_one_clear_punctuation) + sentences_one_clear_punctuation_jieba = jieba.cut(sentences_one_clear_punctuation, cut_all=False, HMM=False) + sentences_one_clear_punctuation_jieba_list = ' '.join(list(sentences_one_clear_punctuation_jieba)).replace(' ', ' ').replace(' ', ' ').strip() + sentences_one_clear_punctuation_all.append(sentences_one_clear_punctuation_jieba_list + ',0.0,1.0' + '\n') + + txtWrite(sentences[0:1] + sentences_one_clear_punctuation_all, '/data/test_data_government_cut.csv') + + #',0.0,1.0' + # np.savetxt('001', [word2vec_model.wv["的"], word2vec_model.wv["的"]]) + # gg = np.loadtxt('001') \ No newline at end of file diff --git a/utils/word2vec_vector.py b/utils/word2vec_vector.py new file mode 100644 index 0000000..50a3e9a --- /dev/null +++ b/utils/word2vec_vector.py @@ -0,0 +1,55 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + +from __future__ import print_function +from utils.text_tools import txtRead, txtWrite +from gensim.models.word2vec import LineSentence +from gensim.models import Word2Vec +import multiprocessing +import logging +import sys +import os + +def 
train_word2vec_by_word(): + logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') + logging.root.setLevel(level=logging.INFO) + logging.info("running") + + inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" + outp1 = "w2v_model_wiki.model" + outp2 = "w2v_model_wiki_word.vec" + model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) + model.save(outp1) + model.wv.save_word2vec_format(outp2, binary=False) + +def train_word2vec_by_char(): + logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') + logging.root.setLevel(level=logging.INFO) + logging.info("running") + + inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt" + outp1 = "w2v_model_wiki.model" + outp2 = "w2v_model_wiki_char.vec" + model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) + model.save(outp1) + model.wv.save_word2vec_format(outp2, binary=False) + + +if __name__ == '__main__': + train_word2vec_by_word() + # train_word2vec_by_char() + + # inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" + # sentences_char = [] + # sentences = txtRead(inp) + # for sentences_one in sentences: + # sentences_one_replace = sentences_one.strip().replace(" ", "") + # sentences_one_replace_all = [] + # for sentences_one_replace_one in sentences_one_replace: + # sentences_one_replace_all.append(sentences_one_replace_one) + # sentences_char.append(" ".join(sentences_one_replace_all) + "\n") + # txtWrite(sentences_char, "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt") + # gg = 0 \ No newline at end of file
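The .vec files written by train_word2vec_by_word and train_word2vec_by_char above are in the plain-text word2vec format, so they can be loaded back with gensim's KeyedVectors. A short sketch, assuming the word-level file sits in the working directory and the query word is in vocabulary:

```python
# Sketch: load the text-format vectors saved by the training functions above.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)
print(w2v.vector_size)                   # 300, matching the size=300 used in training
print(w2v.most_similar("中国", topn=5))  # nearest neighbours of an example in-vocabulary word
```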