diff --git a/ChatBot/__init__.py b/ChatBot/__init__.py
new file mode 100644
index 0000000..d869156
--- /dev/null
+++ b/ChatBot/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+#!/usr/bin/python
+# @Time :2019/3/29 23:11
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/ChatBot/chatbot_search/__init__.py b/ChatBot/chatbot_search/__init__.py
new file mode 100644
index 0000000..d838479
--- /dev/null
+++ b/ChatBot/chatbot_search/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 15:15
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/ChatBot/chatbot_search/chatbot_fuzzy.py b/ChatBot/chatbot_search/chatbot_fuzzy.py
new file mode 100644
index 0000000..605690c
--- /dev/null
+++ b/ChatBot/chatbot_search/chatbot_fuzzy.py
@@ -0,0 +1,163 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/4 10:00
+# @author :Mo
+# @function :
+
+
+from conf.path_config import chicken_and_gossip_path
+from utils.text_tools import txtRead, txtWrite
+from conf.path_config import projectdir
+from fuzzywuzzy import process
+from fuzzywuzzy import fuzz
+import pickle
+import time
+import re
+
+
+def count_same_char(x1, x2):
+    '''Count how many characters of x1 also appear in x2.'''
+    res = []
+    for x in x1:
+        if x in x2:
+            res.append(x)
+    return len(res)
+
+
+def fuzzy_re(user_input, collection):
+    '''Regex matching. Not very effective: it only matches candidates that contain every
+       character of the query (same length or longer), so synonyms or a single differing
+       word cannot be told apart.'''
+    suggestions = []
+    user_input = user_input.replace('.', '').replace('*', '').replace('?', '')
+
+    collection_new = []
+    len_user_input = len(user_input)
+    for coll in collection:  # keep only candidates that contain every character of the query
+        count_coll = 0
+        for i in range(len_user_input):
+            if user_input[i] in coll:
+                count_coll += 1
+        if len_user_input == count_coll:
+            collection_new.append(coll)
+    if not collection_new:
+        return None
+
+    pattern = '.*?'.join(user_input)  # converts 'djm' to 'd.*?j.*?m'
+    try:
+        regex = re.compile(pattern)  # compile the regex
+    except re.error:
+        return None
+    for item in collection_new:
+        match = regex.search(item)  # check whether the current item matches the regex
+        if match:
+            suggestions.append((len(match.group()), match.start(), item))
+    return [x for _, _, x in sorted(suggestions)]
+
+
+def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
+    '''Edit-distance based matching. Slower, but unlike the regex method it can handle
+       candidates whose characters differ from the query.'''
+    collection_new = []
+    len_user_input = len(user_input)
+    for coll in collection:  # keep candidates that share at least one character with the query
+        for i in range(len_user_input):
+            if user_input[i] in coll:
+                collection_new.append(coll)
+    if not collection_new:
+        return None
+    collection_new = list(set(collection_new))
+
+    same_char_list = []
+    for collection_new_one in collection_new:  # prefer candidates that share more characters
+        count_same_char_one = count_same_char(user_input, collection_new_one)
+        same_char_list.append((collection_new_one, count_same_char_one))
+    same_char_list.sort(key=lambda x: x[1], reverse=True)
+    if len(same_char_list) >= 500:
+        same_char_list = same_char_list[0: 500]
+
+    result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20)
+    return result
+
+
+def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
+    '''Edit-distance based matching. Slower, but unlike the regex method it can handle
+       candidates whose characters differ from the query.'''
+
+    start_time = time.time()
+    # user_input_set = set([user_input_one for user_input_one in user_input])
+    user_input_set = [user_input_one for user_input_one in user_input]
+
+
+ same_char_list = []
+ max_data = 0
+ max_data_list = []
+ count_collection_new_one = 0
+    for collection_new_one in collection:  # count how many characters each candidate shares with the query
+ count_same_char_one = len([x for x in user_input_set if x in collection_new_one])
+
+ if count_same_char_one > 0:
+ same_char_list.append((count_collection_new_one, count_same_char_one))
+ if count_same_char_one > max_data:
+ max_data_list.append(count_same_char_one)
+ max_data = count_same_char_one
+ count_collection_new_one += 1
+
+ end_time1 = time.time()
+ list_max_count = []
+ len_max_data_list = len(max_data_list)
+    for x in range(len_max_data_list):  # walk the share-counts from high to low to build the top ranking
+        for k, l in same_char_list:
+            if l == max_data_list[len_max_data_list - 1 - x]:
+                list_max_count.append(qa_list[k])  # pull the QA pair out of qa_list here
+ if len(list_max_count) >= 5000:
+ list_max_count = list_max_count[0:5000]
+ break
+
+ end_time2 = time.time()
+
+ # end_time1: 0.34090662002563477
+ # end_time2: 0.4080846309661865
+
+ # end_time1: 0.06417036056518555
+ # end_time2: 0.08422374725341797
+
+ # same_char_list.sort(key=lambda x: x[1], reverse=True)
+ # if len(same_char_list) >= 20:
+ # same_char_list = same_char_list[0: 20]
+
+ result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
+ end_time3 = time.time()
+
+ # print('end_time1: ' + str(end_time1 - start_time))
+ # print('end_time2: ' + str(end_time2 - start_time))
+ # print('end_time3: ' + str(end_time3 - start_time))
+
+ return result
+ # [fuzz.WRatio, fuzz.QRatio,
+ # fuzz.token_set_ratio, fuzz.token_sort_ratio,
+ # fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
+ # fuzz.UWRatio, fuzz.UQRatio]
+
+
+if __name__ == '__main__':
+ start_time = time.time()
+ qa_list = txtRead(chicken_and_gossip_path)
+ questions = [qa.strip().split("\t")[0] for qa in qa_list]
+ print("read questions ok!")
+ sen = "你谁呀"
+ # list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list)
+ # list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list)
+ print("你问: " + "你谁呀")
+ list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5)
+ print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
+ print("推荐结果: ")
+ print(list_fuzzyfinder)
+
+ while True:
+ print("你问: ")
+ ques = input()
+ list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, ques, qa_list, questions, topn=5)
+ print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
+ print("推荐结果: ")
+ print(list_fuzzyfinder)
diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py
new file mode 100644
index 0000000..e229451
--- /dev/null
+++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py
@@ -0,0 +1,142 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/4 10:00
+# @author :Mo
+# @function :chatbot based search, encode sentence_vec by char
+
+from conf.path_config import w2v_model_char_path
+from conf.path_config import matrix_ques_part_path_char
+from utils.text_tools import txtRead, txtWrite, getChinese
+from conf.path_config import projectdir, chicken_and_gossip_path
+from numpy import float32 as numpy_type
+from collections import Counter
+import pickle, jieba, os, re
+import jieba.posseg as pseg
+from gensim import matutils
+from math import log
+import numpy as np
+import gensim
+import jieba
+
+
+def load_word2vec_model(path, bin=False, limit=None):
+ word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore')
+ return word2vec_model
+
+
+def encoding_question(w2v_model, char_list):
+    ''' Build a sentence vector by summing character vectors.
+    :param w2v_model: gensim word2vec model (char level)
+    :param char_list: list, the characters of the question
+    :return: numpy array, the sentence vector (len = embedding size)
+    '''
+    # start from a zero vector with the right dimensionality
+    try:
+        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
+    except Exception:
+        sentence_vec = w2v_model.wv[w2v_model.index2word[0]] * 0
+
+    for k in range(len(char_list)):
+        char_list_one = char_list[k]
+        if type(char_list_one) == str:
+            try:
+                sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
+            except Exception as e:
+                print(str(e))
+                if char_list_one not in [' ', '']:
+                    sentence_vec = sentence_vec + 1  # OOV char: nudge every dimension by 1
+    return sentence_vec
+
+
+def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+    """
+    Find the most similar sentences: dot product of the query vector with the question matrix.
+    :param vec_ques: numpy array, query sentence vector
+    :param matrix_org: numpy array, matrix of all question sentence vectors
+    :param top_vec: int, number of results to return
+    :return: list of [index, score]
+    """
+    # index every row of the sentence-vector matrix
+    matrix_org_index = list(range(len(matrix_org)))
+    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
+    # normalize matrix_org row-wise
+    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+    # similarity of the query against every row via a matrix-vector dot product
+    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # sort by similarity
+    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
+
+ index_score = []
+ for t in most_similar_sentence_vec_sort[:top_vec]:
+ index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
+ return index_score
+
+
+def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
+    """
+    Build sentence vectors for the standard questions and dump them to disk.
+    :param sen_count: int, how many sentence vectors to write per batch
+    :param word2vec_model: gensim model
+    :param qa_path: str
+    :param matrix_ques_path: str
+    :return: None (or the cached matrix if it already exists)
+    """
+    if os.path.exists(matrix_ques_path):
+        file_matrix_ques = open(matrix_ques_path, 'rb')
+        matrix_ques = pickle.load(file_matrix_ques)
+        return matrix_ques
+    print('create_matrix_org_np start!')
+ qa_dail = txtRead(qa_path, encodeType='utf-8')
+ # questions = []
+ matrix_ques = []
+ count = 0
+ for qa_dail_one in qa_dail:
+ ques = getChinese(qa_dail_one.split('\t')[0])
+ char_list = [ques_char for ques_char in ques]
+ sentence_vec = encoding_question(word2vec_model, char_list)
+ matrix_ques.append(sentence_vec)
+ if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
+ print("count: " + str(count))
+ count += 1
+ np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
+ matrix_ques = []
+ break
+
+ # count += 1
+ # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
+
+    print('create_matrix_org_np ok!')
+ # return matrix_ques
+
+
+if __name__ == '__main__':
+
+    # read the QA corpus
+    syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')
+    # load the char vectors
+    word2vec_model = load_word2vec_model(w2v_model_char_path, limit=None)
+    # build sentence vectors for the standard questions and save them to matrix_ques_path; 100000 by default (configurable), this takes a while
+    if not os.path.exists(matrix_ques_part_path_char):
+        # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
+        create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
+    # reload
+ matrix_ques = np.loadtxt(matrix_ques_part_path_char)
+ print("np.loadtxt(matrix_ques_part_path_char) ok!")
+ while True:
+ print("你问: ")
+ ques_ask = input()
+ ques_clean = getChinese(ques_ask)
+ char_list = [ques_char for ques_char in ques_clean]
+ sentence_vic = encoding_question(word2vec_model, char_list)
+ top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+ try:
+ print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
+ print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
+ except Exception as e:
+            # some characters may not be printable in the console encoding
+ print(str(e))
+
diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py
new file mode 100644
index 0000000..0e3d61c
--- /dev/null
+++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py
@@ -0,0 +1,217 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/4 10:00
+# @author :Mo
+# @function :chatbot based search, encode sentence_vec by word
+
+
+from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
+from conf.path_config import projectdir, chicken_and_gossip_path
+from utils.text_tools import txtRead, txtWrite, getChinese
+from conf.path_config import matrix_ques_part_path
+from numpy import float32 as numpy_type
+from collections import Counter
+import pickle, jieba, os, re
+import jieba.posseg as pseg
+from gensim import matutils
+from math import log
+import numpy as np
+import gensim
+import jieba
+import time
+
+
+def load_word2vec_model(path, bin=False, limit=None):
+ print("load_word2vec_model start!")
+ word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore')
+ print("load_word2vec_model end!")
+ return word2vec_model
+
+
+def is_oov(model_vec, query_seg, p_max=0.16):
+    """
+    Check the OOV (out-of-vocabulary) rate of the segmented query; if more than p_max of
+    the tokens are OOV, the answer should be abandoned.
+    :param model_vec: word vector model
+    :param query_seg: list, segmented query
+    :param p_max: float, maximum tolerated OOV ratio
+    :return: bool
+    """
+    words = [word for word in query_seg if str(word).strip() != ""]
+    count_total = 1
+    count_oov = 0
+    if words:
+        count_total = len(words)
+        for word in words:
+            if word not in model_vec:
+                count_oov = count_oov + 1
+    return float(count_oov / count_total) > p_max
+
+
+def get_td_idf_flag(jieba_cut_list, dictionary, tfidf_model):
+    # todo
+    '''Get tf-idf weights. Known issues: each word is only counted once, and some words
+       (e.g. stopwords) get no weight at all.'''
+    seg1_list = []
+    vec1 = tfidf_model[dictionary.doc2bow(jieba_cut_list)]
+    for vec1_one in vec1:
+        seg1_list.append(vec1_one[1])
+    sum_seg1_list = sum(seg1_list)
+
+    return [x / sum_seg1_list for x in seg1_list]
+
+
+def get_jieba_flag(flag):
+    '''Weight a token by its part-of-speech tag'''
+ if flag in ['n', 'nr', 'ns', 'nt', 'nz']:
+ weight = 1.3
+ elif flag in ['r', 'i', 't', 'ng', 'an']:
+ weight = 0.7
+ else:
+ weight = 1
+ return weight
+
+
+def word_segment_process(sentence):
+    """
+    Segment a sentence with jieba and tag parts of speech.
+    :param sentence: str
+    :return: (word_list, flag_list)
+    """
+ sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
+ word_list = []
+ flag_list = []
+ try:
+ sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
+ words = pseg.cut(sentence_cut)
+ for word in words:
+ word_list.append(word.word)
+ flag_list.append(word.flag)
+ except Exception as e:
+ word_list = [sentence]
+ flag_list = ['nt']
+ return word_list, flag_list
+
+
+def encoding_question(w2v_model, word_list, flag_list):
+    ''' Build a sentence vector as a POS-weighted sum of word vectors.
+    :param w2v_model: gensim word2vec model
+    :param word_list: list, segmented words
+    :param flag_list: list, POS tags aligned with word_list
+    :return: numpy array, the sentence vector (len = embedding size)
+    '''
+    # start from a zero vector with the right dimensionality
+    try:
+        sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
+    except Exception:
+        sentence_vec = w2v_model.wv[w2v_model.index2word[0]] * 0
+
+ for k in range(len(word_list)):
+ word = word_list[k]
+ flag = flag_list[k]
+ if type(word) == str:
+ try:
+ sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
+ except Exception as e:
+ if word not in [' ', '']:
+ sentence_vec = sentence_vec + 1
+
+ return sentence_vec
+
+
+def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
+    """
+    Find the most similar sentences: dot product of the query vector with the question matrix.
+    :param vec_ques: numpy array, query sentence vector
+    :param matrix_org: numpy array, matrix of all question sentence vectors
+    :param top_vec: int, number of results to return
+    :return: list of [index, score]
+    """
+    # index every row of the sentence-vector matrix
+    matrix_org_index = list(range(len(matrix_org)))
+    # Scale a vector to unit length. The only exception is the zero vector, which is returned back unchanged.
+    vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
+    # normalize matrix_org row-wise
+    matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+    # similarity of the query against every row via a matrix-vector dot product
+    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
+    # keep top_vec within bounds
+    top_vec = min(len(matrix_org), top_vec)
+    # sort by similarity
+    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
+
+ index_score = []
+ for t in most_similar_sentence_vec_sort[:top_vec]:
+ index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
+ return index_score
+
+
+def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
+    """
+    Build sentence vectors for the standard questions; sen_count (e.g. 100000) batches the
+    writes so memory does not blow up.
+    :param sen_count: int, how many sentence vectors to write per batch
+    :param word2vec_model: model
+    :param qa_path: str
+    :param matrix_ques_path_word: str
+    :return: None (or the cached matrix if it already exists)
+    """
+    if os.path.exists(matrix_ques_path_word):
+        file_matrix_ques = open(matrix_ques_path_word, 'rb')
+        matrix_ques = pickle.load(file_matrix_ques)
+        return matrix_ques
+    print('create_matrix_org_np start!')
+ qa_dail = txtRead(qa_path, encodeType='utf-8')
+ # questions = []
+ matrix_ques = []
+ count = 0
+ for qa_dail_one in qa_dail:
+ ques = getChinese(qa_dail_one.split('\t')[0])
+ # questions.append(ques)
+ word_list, flag_list = word_segment_process(ques)
+ sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
+ matrix_ques.append(sentence_vec)
+ if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
+ print("count: " + str(count))
+ count += 1
+ np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques)
+ matrix_ques = []
+ # break
+
+ count += 1
+ np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques)
+ # matrix_ques = []
+ # file_matrix_ques = open(matrix_ques_path, 'wb')
+ # pickle.dump(matrix_ques, file_matrix_ques)
+ print('create_matrix_org_np ok!')
+ # return matrix_ques
+
+
+if __name__ == '__main__':
+    # read the QA corpus
+ syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')
+
+    # load word vectors; w2v_model_wiki_word_path is trained by myself, w2v_model_merge_short_path only keeps part of the data, you can download the full one
+ if os.path.exists(w2v_model_wiki_word_path):
+ word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None)
+ print("load w2v_model_wiki_word_path ok!")
+ else:
+ word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None)
+ print("load w2v_model_merge_short_path ok!")
+
+    # build sentence vectors for the standard questions and save them to matrix_ques_path
+ if not os.path.exists(matrix_ques_part_path):
+ create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)
+
+    # load
+ print("np.loadtxt(matrix_ques_part_path) start!")
+ matrix_ques = np.loadtxt(matrix_ques_part_path)
+ print("np.loadtxt(matrix_ques_part_path) end!")
+ while True:
+ print("你: ")
+ ques_ask = input()
+ ques_clean = getChinese(ques_ask)
+ word_list, flag_list = word_segment_process(ques_clean)
+ sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
+ top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
+ try:
+ print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
+ print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
+ except Exception as e:
+            # some characters may not be printable in the console encoding
+ print(str(e))
diff --git a/FeatureProject/__init__.py b/FeatureProject/__init__.py
new file mode 100644
index 0000000..98d55da
--- /dev/null
+++ b/FeatureProject/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+#!/usr/bin/python
+# @Time :2019/3/29 23:10
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/FeatureProject/__pycache__/__init__.cpython-36.pyc b/FeatureProject/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..1a3197e
Binary files /dev/null and b/FeatureProject/__pycache__/__init__.cpython-36.pyc differ
diff --git a/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc
new file mode 100644
index 0000000..782f92e
Binary files /dev/null and b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc differ
diff --git a/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc
new file mode 100644
index 0000000..e6a5b80
Binary files /dev/null and b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc differ
diff --git a/FeatureProject/cut_td_idf.py b/FeatureProject/cut_td_idf.py
new file mode 100644
index 0000000..ac2c4b0
--- /dev/null
+++ b/FeatureProject/cut_td_idf.py
@@ -0,0 +1,104 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/1 10:35
+# @author :Mo
+# @function :cut sentences
+
+
+from conf.path_config import chicken_and_gossip_path, td_idf_cut_path, td_idf_cut_pinyin
+from utils.text_tools import txtWrite, txtRead, get_syboml, strQ2B
+from conf.path_config import projectdir
+from gensim import corpora, models
+import xpinyin
+import pickle
+import jieba
+
+
+def cut_td_idf(sources_path, target_path):
+    """
+    Segment Chinese text with jieba.
+    :param sources_path: str
+    :return: None
+    """
+ print("cut_td_idf start! ")
+ corpus = txtRead(sources_path)
+ governments = []
+ for corpus_one in corpus:
+ corpus_one_clear = corpus_one.replace(' ', '').strip()
+ ques_q2b = strQ2B(corpus_one_clear.strip())
+ ques_q2b_syboml = get_syboml(ques_q2b)
+ governments.append(ques_q2b_syboml.strip())
+
+ government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))
+
+ topic_ques_all = []
+ for topic_ques_one in government_ques:
+ top_ques_aqlq = topic_ques_one.replace(' ', ' ').replace(' ', ' ').strip() + '\n'
+ topic_ques_all.append(top_ques_aqlq)
+
+ txtWrite(topic_ques_all, target_path)
+ print("cut_td_idf ok! " + sources_path)
+
+
+def cut_td_idf_pinyin(sources_path, target_path):  # convert to pinyin
+    """
+    Convert Chinese text to pinyin.
+    :param sources_path: str
+    :return: None
+    """
+ pin = xpinyin.Pinyin()
+ corpus = txtRead(sources_path)
+ topic_ques_all = []
+ corpus_count = 0
+ for corpus_one in corpus:
+ corpus_count += 1
+ # time1 = time.time()
+ corpus_one_clear = corpus_one.replace(' ', '').strip()
+ ques_q2b = strQ2B(corpus_one_clear.strip())
+ ques_q2b_syboml = get_syboml(ques_q2b)
+ ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').replace(' ', '').strip(), ' ')
+ topic_ques_all.append(ques_q2b_syboml_pinying + '\n')
+ # time2 = time.time()
+ # print(str(corpus_count) + 'time:' + str(time2 - time1))
+ txtWrite(topic_ques_all, target_path)
+ print("cut_td_idf_pinyin ok! " + sources_path)
+
+
+def init_tfidf_chinese_or_pinyin(sources_path):
+    """
+    Build the tf-idf model and dictionary and pickle them next to the source file.
+    :param sources_path: str
+    :return: None
+    """
+ questions = txtRead(sources_path)
+ corpora_documents = []
+ for item_text in questions:
+ item_seg = list(jieba.cut(str(item_text).strip()))
+ corpora_documents.append(item_seg)
+
+ dictionary = corpora.Dictionary(corpora_documents)
+ corpus = [dictionary.doc2bow(text) for text in corpora_documents]
+ tfidf_model = models.TfidfModel(corpus)
+ print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
+ file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb')
+ pickle.dump([dictionary, tfidf_model], file)
+
+
+if __name__ == '__main__':
+ # path_text = projectdir + '/Data/chicken_gossip.txt'
+ # sentences = txtRead(path_text)
+ # sentences_q = []
+ # for sentences_one in sentences:
+ # sentences_one_replace = sentences_one.replace(" ", "").replace("\t", "")
+ # sentences_one_replace_split = sentences_one_replace.split("|")
+ # sentence_new = sentences_one_replace_split[0] + "\t" + "".join(sentences_one_replace_split[1:])
+ # sentences_q.append(sentence_new)
+ # sentences = txtWrite(sentences_q, projectdir + '/Data/chicken_and_gossip.txt')
+
+
+ cut_td_idf(chicken_and_gossip_path, td_idf_cut_path)
+ cut_td_idf_pinyin(chicken_and_gossip_path, td_idf_cut_pinyin)
+ init_tfidf_chinese_or_pinyin(td_idf_cut_path)
+ init_tfidf_chinese_or_pinyin(td_idf_cut_pinyin)
+ print("corpus ok!")
+
diff --git a/FeatureProject/distance_text_or_vec.py b/FeatureProject/distance_text_or_vec.py
new file mode 100644
index 0000000..b501b99
--- /dev/null
+++ b/FeatureProject/distance_text_or_vec.py
@@ -0,0 +1,330 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/4 10:00
+# @author :Mo
+# @function :
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from utils.text_tools import txtRead, get_syboml, strQ2B
+import Levenshtein as Leven
+from fuzzywuzzy import fuzz
+import jieba.analyse
+import numpy as np
+import xpinyin
+import pickle
+import jieba
+import os
+
+
+zero_bit = 0.000000001
+pin = xpinyin.Pinyin()
+
+
+def clear_sentence(sentence):
+    """
+    Clean the sentence: strip spaces and convert full-width characters to half-width.
+    :param sentence: str, input sentence
+    :return: str, cleaned sentence
+    """
+    corpus_one_clear = str(sentence).replace(' ', '').strip()
+    ques_q2b = strQ2B(corpus_one_clear.strip())
+    ques_q2b_syboml = get_syboml(ques_q2b)
+    return ques_q2b_syboml
+
+
+def chinese2pinyin(sentence):
+    """
+    Translate Chinese to pinyin.
+    :param sentence: str, input sentence
+    :return: str, pinyin output
+    """
+    ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ')
+    return ques_q2b_syboml_pinying
+
+
+def hamming_distance(v1, v2):
+ n = int(v1, 2) ^ int(v2, 2)
+ return bin(n & 0xffffffff).count('1')
+
+
+def cosine_distance(v1, v2):  # cosine similarity
+ if v1.all() and v2.all():
+ return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
+ else:
+ return 0
+
+
+def euclidean_distance(v1, v2):  # Euclidean distance
+    return np.sqrt(np.sum(np.square(v1 - v2)))
+
+
+def manhattan_distance(v1, v2):  # Manhattan distance
+    return np.sum(np.abs(v1 - v2))
+
+
+def chebyshev_distance(v1, v2):  # Chebyshev distance
+    return np.max(np.abs(v1 - v2))
+
+
+def minkowski_distance(v1, v2):  # Minkowski distance (with p=2, i.e. the same as the Euclidean distance)
+    return np.sqrt(np.sum(np.square(v1 - v2)))
+
+
+def euclidean_distance_standardized(v1, v2):  # standardized Euclidean distance
+    v1_v2 = np.vstack([v1, v2])
+    sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1)
+    return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum())
+
+
+def mahalanobis_distance(v1, v2):  # Mahalanobis distance
+    # Mahalanobis distance needs more samples than dimensions, otherwise the covariance matrix cannot be inverted;
+    # transpose here so that the columns become the samples
+    X = np.vstack([v1, v2])
+    XT = X.T
+
+    # method 1: solve directly from the formula
+    S = np.cov(X)  # covariance matrix between the two dimensions
+    try:
+        SI = np.linalg.inv(S)  # inverse of the covariance matrix  todo
+    except Exception:
+        SI = np.zeros_like(S)
+    # compute the pairwise Mahalanobis distance between every two samples
+    n = XT.shape[0]
+    distance_all = []
+    for i in range(0, n):
+        for j in range(i + 1, n):
+            delta = XT[i] - XT[j]
+            distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
+            distance_all.append(distance_1)
+    return np.sum(np.abs(distance_all))
+
+
+def bray_curtis_distance(v1, v2):  # Bray-Curtis distance, an ecological distance from biology
+    up_v1_v2 = np.sum(np.abs(v2 - v1))
+    down_v1_v2 = np.sum(v1) + np.sum(v2)
+    return up_v1_v2 / (down_v1_v2 + zero_bit)
+
+
+def pearson_correlation_distance(v1, v2):  # Pearson correlation coefficient
+    v1_v2 = np.vstack([v1, v2])
+    return np.corrcoef(v1_v2)[0][1]
+
+
+def jaccard_similarity_coefficient_distance(v1, v2):  # Jaccard similarity coefficient
+    # solve directly from the formula
+    v1 = np.asarray(v1)
+    v2 = np.asarray(v2)
+    up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum())
+    down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit)
+    return up / down
+
+
+def wmd_distance(model, sent1_cut_list, sent2_cut_list):  # Word Mover's Distance
+    # model.init_sims(replace=True)
+    distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
+    return distance
+
+
+# def HamMings_Levenshtein(str1, str2):
+# sim = Leven.hamming(str1, str2)
+# return sim
+
+def edit_levenshtein(str1, str2):
+ return Leven.distance(str1, str2)
+
+
+def ratio_levenshtein(str1, str2):
+ return Leven.ratio(str1, str2)
+
+
+def jaro_levenshtein(str1, str2):
+ return Leven.jaro(str1, str2)
+
+
+def set_ratio_fuzzywuzzy(str1, str2):
+ return fuzz.token_set_ratio(str1, str2)
+
+
+def sort_ratio_fuzzywuzzy(str1, str2):
+ return fuzz.token_sort_ratio(str1, str2)
+
+
+def num_of_common_sub_str(str1, str2):
+    '''
+    Length of the longest common substring of two strings.
+    Idea: dynamic programming over a 2D table that records whether consecutive positions match.
+    '''
+    lstr1 = len(str1)
+    lstr2 = len(str2)
+    record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)]  # one extra row and column
+    maxNum = 0  # length of the longest match
+    p = 0  # end position of the longest match
+
+    for i in range(lstr1):
+        for j in range(lstr2):
+            if str1[i] == str2[j]:
+                # same character: extend the diagonal run
+                record[i + 1][j + 1] = record[i][j] + 1
+                if record[i + 1][j + 1] > maxNum:
+                    # new longest match
+                    maxNum = record[i + 1][j + 1]
+                    # remember where the longest match ends
+                    p = i + 1
+    # return str1[p - maxNum:p], maxNum
+    return maxNum
+
+
+#######################################################  Hamming distance (simhash-based)
+def string_hash(source):
+ if source == "":
+ return 0
+ else:
+ x = ord(source[0]) << 7
+ m = 1000003
+ mask = 2 ** 128 - 1
+ for c in source:
+ x = ((x * m) ^ ord(c)) & mask
+ x ^= len(source)
+ if x == -1:
+ x = -2
+ x = bin(x).replace('0b', '').zfill(64)[-64:]
+
+ return str(x)
+
+
+def sim_hash(content):
+ seg = jieba.cut(content)
+ keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
+    # sorted by weight first, then by word
+ keyList = []
+ # print(keyWord)
+ for feature, weight in keyWord:
+ weight = int(weight * 20)
+ feature = string_hash(feature)
+ temp = []
+ for f in feature:
+ if f == '1':
+ temp.append(weight)
+ else:
+ temp.append(-weight)
+ keyList.append(temp)
+ content_list = np.sum(np.array(keyList), axis=0)
+    # nothing could be extracted (e.g. the encoding could not be read)
+ if len(keyList) == 0:
+ return '00'
+ simhash = ''
+ for c in content_list:
+ if c > 0:
+ simhash = simhash + '1'
+ else:
+ simhash = simhash + '0'
+ return simhash
+
+
+def hamming_distance_equal(v1, v2):
+ n = int(v1, 2) ^ int(v2, 2)
+ return bin(n & 0xffffffff).count('1')
+
+
+def hamming_distance(sen1, sen2):
+ return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2))
+
+
+def normalization(x):
+    """
+    Min-max normalization to [0, 1].
+    :param x: list or array
+    :return: list
+    """
+    return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x]
+
+
+def z_score(x, axis=0):
+    """
+    Z-score standardization.
+    :param x: array, numpy
+    :param axis: int, 0
+    :return: array, numpy
+    """
+ x = np.array(x).astype(float)
+ xr = np.rollaxis(x, axis=axis)
+ xr -= np.mean(x, axis=axis)
+ xr /= np.std(x, axis=axis)
+ # print(x)
+ return x
+
+
+def tok_td_idf(data_path):
+ if os.path.exists(data_path + 'td_idf_cut.csv'):
+        '''compute tf-idf to build the train/test data'''
+        datas = txtRead(data_path + 'td_idf_cut.csv')
+        # by default token_pattern only matches tokens of length >= 2, change it to 1; with ngram_range=(1, 2) there are about 50428 tokens in total
+ # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000)
+ vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
+ max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
+ vec_tdidf.fit_transform(datas)
+ file_vec_tdidf = open(data_path + 'td_idf_cut_model.pkl', 'wb')
+ pickle.dump(vec_tdidf, file_vec_tdidf)
+
+ return vec_tdidf
+
+
+def tok_td_idf_pinyin(data_path):
+ if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'):
+        '''compute tf-idf to build the train/test data'''
+        datas = txtRead(data_path + 'td_idf_cut_pinyin.csv')
+        # by default token_pattern only matches tokens of length >= 2, change it to 1; with ngram_range=(1, 2) there are about 50428 tokens in total
+ # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000)
+ vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
+ max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
+ vec_tdidf.fit_transform(datas)
+ file_vec_tdidf = open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb')
+ pickle.dump(vec_tdidf, file_vec_tdidf)
+
+ return vec_tdidf
+
+
+if __name__ == '__main__':
+ vec1_test = np.array([1, 38, 17, 32])
+ vec2_test = np.array([5, 6, 8, 9])
+
+ str1_test = "你到底是谁?"
+ str2_test = "没想到我是谁,是真样子"
+
+    print(clear_sentence(str1_test))  # data cleaning
+    print(chinese2pinyin(str1_test))  # Chinese to pinyin
+
+ print(euclidean_distance(vec1_test, vec2_test))
+ print(cosine_distance(vec1_test, vec2_test))
+ print(manhattan_distance(vec1_test, vec2_test))
+ print(euclidean_distance(vec1_test, vec2_test))
+ print(chebyshev_distance(vec1_test, vec2_test))
+ print(minkowski_distance(vec1_test, vec2_test))
+
+ print(euclidean_distance_standardized(vec1_test, vec2_test))
+ print(mahalanobis_distance(vec1_test, vec2_test))
+
+ print('###############################################')
+
+ print(bray_curtis_distance(vec1_test, vec2_test))
+ print(pearson_correlation_distance(vec1_test, vec2_test))
+ print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test))
+
+ print('###############################################')
+
+    # print(HamMings_Levenshtein(str1, str2))  # requires strings of equal length
+    # print(Wmd_distance(model, sent1_cut_list, sent2_cut_list))  # requires a gensim word2vec model
+
+ print(hamming_distance(str1_test, str2_test))
+ print(edit_levenshtein(str1_test, str2_test))
+ print(ratio_levenshtein(str1_test, str2_test))
+ print(jaro_levenshtein(str1_test, str2_test))
+ print(set_ratio_fuzzywuzzy(str1_test, str2_test))
+ print(sort_ratio_fuzzywuzzy(str1_test, str2_test))
+ print(num_of_common_sub_str(str1_test, str2_test))
+    print(normalization(vec1_test))  # min-max normalization (0-1)
+    print(z_score(vec1_test))  # standardization (around 0, positive and negative)
+
+ # data_path = 'D:/workspace/python/bitbucket/nlp_model_v1.0/nlp_model/models/word_feature/sim_data/'
+ # tok_TD_IDF(data_path)
+ # tok_TD_IDF_pinyin(data_path)
diff --git a/FeatureProject/distance_vec_TS_SS.py b/FeatureProject/distance_vec_TS_SS.py
new file mode 100644
index 0000000..447ee79
--- /dev/null
+++ b/FeatureProject/distance_vec_TS_SS.py
@@ -0,0 +1,84 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 10:36
+# @author :Mo
+# @function :TS-SS distance
+# @url :https://github.com/taki0112/Vector_Similarity
+# @paper :A Hybrid Geometric Approach for Measuring Similarity Level Among Documents and Document Clustering
+
+
+import numpy as np
+import math
+
+zero_bit = 0.000000001
+
+
+def Cosine(vec1, vec2):
+    """
+    Cosine similarity.
+    :param vec1: array
+    :param vec2: array
+    :return: float
+    """
+ result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2) + zero_bit)
+ return result
+
+
+def VectorSize(vec):
+ vec_pow = sum(math.pow(v + zero_bit, 2) for v in vec)
+ if vec_pow >= 0:
+ return math.sqrt(vec_pow)
+ else:
+ return zero_bit
+
+
+def InnerProduct(vec1, vec2):
+ try:
+ return sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
+ except:
+ return zero_bit
+
+
+def Euclidean(vec1, vec2):
+ vec12_pow = sum(math.pow((v1 - v2), 2) for v1, v2 in zip(vec1, vec2))
+ if vec12_pow >= 0:
+ return math.sqrt(vec12_pow)
+ else:
+ return zero_bit
+
+
+def Theta(vec1, vec2):
+ cosine_vec12 = Cosine(vec1, vec2)
+ if -1 <= cosine_vec12 and cosine_vec12 <= 1:
+ return math.acos(cosine_vec12) + 10
+ else:
+ return zero_bit + 10
+
+
+def Triangle(vec1, vec2):
+ theta = math.radians(Theta(vec1, vec2))
+ return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2
+
+
+def Magnitude_Difference(vec1, vec2):
+ return abs(VectorSize(vec1) - VectorSize(vec2))
+
+
+def Sector(vec1, vec2):
+ ED = Euclidean(vec1, vec2)
+ MD = Magnitude_Difference(vec1, vec2)
+ theta = Theta(vec1, vec2)
+ return math.pi * math.pow((ED + MD), 2) * theta / 360
+
+
+def TS_SS(vec1, vec2):
+ return Triangle(vec1, vec2) * Sector(vec1, vec2)
+
+
+if __name__ == '__main__':
+ vec1_test = np.array([1, 38, 17, 32])
+ vec2_test = np.array([5, 6, 8, 9])
+
+ print(Euclidean(vec1_test, vec2_test))
+ print(Cosine(vec1_test, vec2_test))
+ print(TS_SS(vec1_test, vec2_test))
diff --git a/FeatureProject/normalization_util.py b/FeatureProject/normalization_util.py
new file mode 100644
index 0000000..4edf2b1
--- /dev/null
+++ b/FeatureProject/normalization_util.py
@@ -0,0 +1,96 @@
+# -*- coding: UTF-8 -*-
+#!/usr/bin/python
+# @Time :2019/3/12 14:18
+# @author :Mo
+# @site :https://blog.csdn.net/rensihui
+
+from sklearn import preprocessing
+import numpy as np
+
+def autoL1L2(data, norms='l1'):
+    '''L1 or L2 normalization'''
+    return preprocessing.normalize(data, norm=norms)
+
+def autoScale(data):
+    '''Standardization, (X-mean)/std: every column ends up centered around 0 with variance 1.'''
+    return preprocessing.scale(data)
+
+def autoMinMaxScaler(data):
+    '''Scale features to a given range'''
+    return preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
+
+def autoLinNorm(data):  # takes a matrix
+    ''' 0-1 (min-max) normalization
+    :param data: [] matrix
+    :return: []
+    '''
+    mins = data.min(0)  # column-wise minima, as a vector
+    maxs = data.max(0)  # column-wise maxima, as a vector
+    ranges = maxs - mins  # column-wise range = max - min
+    normData = np.zeros(np.shape(data))  # matrix of the same shape as data, holds the normalized result
+    row = data.shape[0]  # number of rows of data
+    normData = data - np.tile(mins, (row, 1))  # subtract the column minimum from every column
+    normData = normData / np.tile(ranges, (row, 1))  # divide every column by its range (max - min)
+    return normData
+
+
+
+def autoAvgNorm(data):  # takes a matrix
+    ''' mean normalization: (x - mean) / std, column-wise
+    :param data: [] matrix
+    :return: []
+    '''
+    avg = np.average(data, axis=0)  # column-wise mean, as a vector
+    sigma = np.std(data, axis=0)  # column-wise standard deviation, as a vector
+    normData = np.zeros(np.shape(data))  # matrix of the same shape as data, holds the normalized result
+    row = data.shape[0]  # number of rows of data
+    normData = data - np.tile(avg, (row, 1))  # subtract the column mean from every column
+    normData = normData / np.tile(sigma, (row, 1))  # divide every column by its standard deviation
+    return normData
+
+
+
+### Sigmoid: an S-shaped curve and a good threshold function, centrally symmetric at (0, 0.5) with a fairly
+# steep slope around that point; as the data goes to positive/negative infinity the mapped values approach 1 and 0.
+# It also works well for threshold splitting; here it is used purely as a normalization with (0, 0.5) as the split point.
+def sigmoid(data, useStatus):
+    ''' sigmoid normalization
+    :param data: [] matrix
+    :param useStatus: bool, apply the sigmoid or return the data unchanged
+    :return: []
+    '''
+    if useStatus:
+        row = data.shape[0]
+        column = data.shape[1]
+        normData = np.zeros(np.shape(data))
+        for i in range(row):
+            for j in range(column):
+                normData[i][j] = 1.0 / (1 + np.exp(-float(data[i][j])))
+        return normData
+    else:
+        return data
+
+if __name__ == '__main__':
+ arr = np.array([[8, 7, 8], [4, 3, 1], [6, 9, 8]])
+
+ print("l1正则化")
+ print(autoL1L2(arr, norms='l1'))
+
+ print("l2正则化")
+ print(autoL1L2(arr, norms='l2'))
+
+ print("0-1标准化处理")
+ print(autoScale(arr))
+
+ print("0-1缩放处理")
+ print(autoMinMaxScaler(arr))
+
+
+ print("0-1归一化处理")
+ print(autoLinNorm(arr))
+
+
+ print("均值归一化处理")
+ print(autoAvgNorm(arr))
+
+ print("sig归一化处理")
+ print(sigmoid(arr,True))
diff --git a/FeatureProject/sentence_sim_feature.py b/FeatureProject/sentence_sim_feature.py
new file mode 100644
index 0000000..ffe056c
--- /dev/null
+++ b/FeatureProject/sentence_sim_feature.py
@@ -0,0 +1,384 @@
+# -*- coding:utf-8 -*-
+# -*- created by: moyongzhuo -*-
+
+
+from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, euclidean_distance, jaccard_similarity_coefficient_distance
+from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized
+from FeatureProject.distance_text_or_vec import mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance
+from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score
+from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy
+from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str
+from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin
+from FeatureProject.distance_vec_TS_SS import TS_SS
+from gensim import corpora, models, matutils
+from conf.path_config import projectdir
+from gensim.models import KeyedVectors
+import pandas as pd
+import numpy as np
+import pickle
+import jieba
+import time
+import os
+
+
+class SentenceSimFeature:
+ def __init__(self):
+ self.sen1 = None
+ self.sen2 = None
+ self.seg1 = None
+ self.seg2 = None
+ self.sen_vec1 = None
+ self.sen_vec2 = None
+ self.tfidf_vec1 = None
+ self.tfidf_vec2 = None
+ self.dictionary = None
+ self.tfidf_model = None
+ self.w2c_model = None
+
+ self.tfidf_pinyin_model = None
+ self.dictionary_pinyin = None
+ self.sen1_pinyin = None
+ self.sen2_pinyin = None
+ self.seg1_pinyin = None
+ self.seg2_pinyin = None
+ self.tfidf_vec1_pinyin = None
+ self.tfidf_vec2_pinyin = None
+
+ def set_data(self, sen1, sen2):
+ sen1 = clear_sentence(sen1)
+ sen2 = clear_sentence(sen2)
+ self.sen1 = str(sen1).strip()
+ self.sen2 = str(sen2).strip()
+ self.seg1 = list(jieba.cut(sen1))
+ self.seg2 = list(jieba.cut(sen2))
+ self.sen1_pinyin = chinese2pinyin(sen1)
+ self.sen2_pinyin = chinese2pinyin(sen2)
+ self.seg1_pinyin = (self.sen1_pinyin).split(' ')
+ self.seg2_pinyin = (self.sen2_pinyin).split(' ')
+ self.sen_vec1 = np.zeros(300)
+ self.sen_vec2 = np.zeros(300)
+ # self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0])
+ # self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0])
+ # self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0])
+ # self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0])
+ self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)]
+ self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)]
+ self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)]
+ self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)]
+
+ def same_word_count(self):
+ count_left = 0
+ for s in self.seg1:
+ if s in self.seg2:
+ count_left += 1
+
+ count_right = 0
+ for s in self.seg2:
+ if s in self.seg1:
+ count_right += 1
+
+ return min(count_left, count_right)
+
+ def same_char_count(self):
+ seg1 = list(self.sen1)
+ seg2 = list(self.sen2)
+
+ count_left = 0
+ for s in seg1:
+ if s in seg2:
+ count_left += 1
+
+ count_right = 0
+ for s in seg2:
+ if s in seg1:
+ count_right += 1
+
+ return min(count_left, count_right)
+
+ def sentence_length(self):
+ len_sen1 = len(self.sen1)
+ len_sen2 = len(self.sen2)
+ len_abs_sub = abs(len_sen1 - len_sen2)
+ len_rate = len_sen1 / len_sen2
+ len_add_rate = len_sen1 * len_sen2 / (len_sen1 + len_sen2)
+
+ return [len_abs_sub, len_rate, len_add_rate]
+
+ def init_sentence_vector(self):
+ # file_path = os.path.dirname(__file__)
+ print('load w2v model begin')
+ # model_path = os.path.join(file_path, word2_vec_path)
+ self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None) # ,binary=True)
+ print('load w2v model success')
+
+ def encode_sentence_vector(self):
+ for s in self.seg1:
+ try:
+ self.sen_vec1 += self.w2c_model[s]
+ except:
+ self.sen_vec1 += np.zeros(300)
+ continue
+
+ for s in self.seg2:
+ try:
+ self.sen_vec2 += self.w2c_model[s]
+ except:
+ self.sen_vec2 += np.zeros(300)
+ continue
+
+ def init_tfidf(self):
+ file = open(td_idf_path, 'rb')
+ tfidf_dictionary_model = pickle.load(file)
+ self.dictionary = tfidf_dictionary_model[0]
+ self.tfidf_model = tfidf_dictionary_model[1]
+
+ file = open(td_idf_path_pinyin, 'rb')
+ tfidf_dictionary_pinyin_model = pickle.load(file)
+ self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0]
+ self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1]
+ print("init_tfidf ok!")
+
+ def w2c_all_vec(self):
+ w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2)
+ w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2)
+ w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2)
+ w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2)
+ w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2)
+
+ w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2)
+ w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2)
+
+ w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2)
+ w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2)
+ w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2)
+ w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2)
+
+ # w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2)
+ return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev,
+ w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson]
+
+ def tdidf_all_vec(self):
+
+ return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2)
+
+ def edit_all_str(self):
+ str_hamming = hamming_distance(self.sen1, self.sen2)
+ str_edit = edit_levenshtein(self.sen1, self.sen2)
+ str_ratio = ratio_levenshtein(self.sen1, self.sen2)
+ str_jaro = jaro_levenshtein(self.sen1, self.sen2)
+ str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2)
+ str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2)
+ str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2)
+ str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2)
+
+ return [str_hamming, str_edit, str_ratio, str_jaro,
+ str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd]
+
+ def word_jaccard(self):
+ a = list(set(self.seg1).intersection(set(self.seg2)))
+ b = list(set(self.seg1).union(set(self.seg2)))
+ return float(len(a) / len(b))
+
+ def char_jaccard(self):
+ a = list(set(list(self.sen1)).intersection(set(list(self.sen2))))
+ b = list(set(list(self.sen1)).union(set(list(self.sen2))))
+
+ return float(len(a) / len(b))
+
+ def tdidf_all_vec_pinyin(self):
+
+ return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin)
+
+ def edit_all_pinyin(self):
+ pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin)
+ pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin)
+
+ return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
+ pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd]
+
+ def word_jaccard_pinyin(self):
+ a = list(set(self.seg1_pinyin).intersection(set(self.seg2_pinyin)))
+ b = list(set(self.seg1_pinyin).union(set(self.seg2_pinyin)))
+ return float(len(a) / len(b))
+
+ def char_jaccard_pinyin(self):
+ a = list(set(list(self.seg1_pinyin)).intersection(set(list(self.seg2_pinyin))))
+ b = list(set(list(self.seg1_pinyin)).union(set(list(self.seg2_pinyin))))
+
+ return float(len(a) / len(b))
+
+
+def sentence_input_t():
+ while True:
+ s1 = input('s1: ')
+ s2 = input('s2: ')
+
+ start_time = time.time()
+ ssf.set_data(s1, s2)
+ ssf.encode_sentence_vector()
+
+ time1 = time.time()
+ print('set_data time:' + str(time1 - start_time))
+
+        # shared-word and length features
+ same_word_count = ssf.same_word_count()
+ time2 = time.time()
+ print('same_word_count time:' + str(time2 - time1))
+
+ same_char_count = ssf.same_char_count()
+ time3 = time.time()
+ print('same_char_count time:' + str(time3 - time2))
+
+ [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
+ time4 = time.time()
+ print('sentence_length time:' + str(time4 - time3))
+
+ # w2c_all_vec
+ [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
+ w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
+ w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
+ time5 = time.time()
+ print('w2c_all_vec time:' + str(time5 - time4))
+
+ # tdidf_all_vec
+ # [tdidf_Cosine, tdidf_TS_SS, tdidf_Manhattan, tdidf_Euclidean,
+ # tdidf_Jaccard, tdidf_Chebyshev,tdidf_Minkowski, tdidf_Euclidean_Standard, tdidf_Mahalanobis,
+ # tdidf_Bray, tdidf_Pearson] = ssf.tdidf_all_vec()
+ tdidf_cossim = ssf.tdidf_all_vec()
+ time6 = time.time()
+ print('tdidf_all_vec time:' + str(time6 - time5))
+
+ # edit_all_str
+ [str_hamming, str_edit, str_ratio, str_jaro,
+ str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
+ time7 = time.time()
+ print('edit_all_str time:' + str(time7 - time6))
+
+        # Jaccard coefficients
+ word_jaccard = ssf.word_jaccard()
+ char_jaccard = ssf.char_jaccard()
+ time8 = time.time()
+ print('jaccard系数 time:' + str(time8 - time7))
+
+ # tdidf_all_vec_pinyin
+ # [tdidf_piyin_Cosine, tdidf_piyin_TS_SS, tdidf_piyin_Manhattan, tdidf_piyin_Euclidean, tdidf_piyin_Jaccard,
+ # tdidf_piyin_Chebyshev, tdidf_piyin_Minkowski, tdidf_piyin_Euclidean_Standard, tdidf_piyin_Mahalanobis,
+ # tdidf_piyin_Bray, tdidf_piyin_Pearson] = ssf.tdidf_all_vec_pinyin()
+ tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
+ time9 = time.time()
+ print('tdidf_all_vec_pinyin time:' + str(time9 - time8))
+
+ # edit_all_pinyin
+ [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
+ pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
+ time10 = time.time()
+ print('edit_all_pinyin time:' + str(time10 - time9))
+
+        # Jaccard coefficients
+ word_jaccard_pinyin = ssf.word_jaccard_pinyin()
+ char_jaccard_pinyin = ssf.char_jaccard_pinyin()
+ time11 = time.time()
+ print('jaccard系数pinyin time:' + str(time11 - time10))
+ sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
+ w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
+ w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
+ tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz,
+ str_sort_ratio_fuzz,
+ str_commonsubstr, str_list_Wmd,
+ word_jaccard, char_jaccard, tdidf_pinyin_cossim,
+ pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
+ pinyin_sort_ratio_fuzz,
+ pinyin_commonsubstr, pinyin_list_Wmd,
+ word_jaccard_pinyin, char_jaccard_pinyin]
+ print("sim: ")
+ print(sim_all_last)
+
+
+if __name__ == '__main__':
+ ssf = SentenceSimFeature()
+ ssf.init_sentence_vector()
+ ssf.init_tfidf()
+ s1 = "你知道Mo的能力上限吗"
+ s2 = "你好呀,Mo水平很差"
+ start_time = time.time()
+
+ ssf.set_data(s1, s2)
+ ssf.encode_sentence_vector()
+
+ time1 = time.time()
+ print('set_data time:' + str(time1 - start_time))
+
+    # shared-word and length features
+ same_word_count = ssf.same_word_count()
+ time2 = time.time()
+ print('same_word_count time:' + str(time2 - time1))
+
+ same_char_count = ssf.same_char_count()
+ time3 = time.time()
+ print('same_char_count time:' + str(time3 - time2))
+
+ [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
+ time4 = time.time()
+ print('sentence_length time:' + str(time4 - time3))
+
+ # w2c_all_vec
+ [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
+ w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
+ w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
+ time5 = time.time()
+ print('w2c_all_vec time:' + str(time5 - time4))
+
+ # tdidf_all_vec
+ tdidf_cossim = ssf.tdidf_all_vec()
+ time6 = time.time()
+ print('tdidf_all_vec time:' + str(time6 - time5))
+
+ # edit_all_str
+ [str_hamming, str_edit, str_ratio, str_jaro,
+ str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
+ time7 = time.time()
+ print('edit_all_str time:' + str(time7 - time6))
+
+    # Jaccard coefficients
+ word_jaccard = ssf.word_jaccard()
+ char_jaccard = ssf.char_jaccard()
+ time8 = time.time()
+ print('jaccard系数 time:' + str(time8 - time7))
+
+ # pinyin
+ tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
+ time9 = time.time()
+ print('tdidf_all_vec_pinyin time:' + str(time9 - time8))
+
+ # edit_all_pinyin
+ [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
+ pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
+ time10 = time.time()
+ print('edit_all_pinyin time:' + str(time10 - time9))
+
+    # Jaccard coefficients
+ word_jaccard_pinyin = ssf.word_jaccard_pinyin()
+ char_jaccard_pinyin = ssf.char_jaccard_pinyin()
+ time11 = time.time()
+ print('jaccard系数pinyin time:' + str(time11 - time10))
+
+ sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
+ w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
+ w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
+ tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz,
+ str_commonsubstr, str_list_Wmd,
+ word_jaccard, char_jaccard, tdidf_pinyin_cossim,
+ pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
+ pinyin_sort_ratio_fuzz,
+ pinyin_commonsubstr, pinyin_list_Wmd,
+ word_jaccard_pinyin, char_jaccard_pinyin]
+ print("小姜机器人计算sim: ")
+ print(sim_all_last)
+
+ sentence_input_t()
diff --git a/conf/__init__.py b/conf/__init__.py
new file mode 100644
index 0000000..b238954
--- /dev/null
+++ b/conf/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 11:23
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/conf/__pycache__/__init__.cpython-36.pyc b/conf/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..9042934
Binary files /dev/null and b/conf/__pycache__/__init__.cpython-36.pyc differ
diff --git a/conf/__pycache__/path_config.cpython-36.pyc b/conf/__pycache__/path_config.cpython-36.pyc
new file mode 100644
index 0000000..6ee0fdc
Binary files /dev/null and b/conf/__pycache__/path_config.cpython-36.pyc differ
diff --git a/conf/path_config.py b/conf/path_config.py
new file mode 100644
index 0000000..72bf2aa
--- /dev/null
+++ b/conf/path_config.py
@@ -0,0 +1,39 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 11:23
+# @author :Mo
+# @function :path
+
+
+import pathlib
+import sys
+import os
+
+
+# base dir
+projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)
+sys.path.append(projectdir)
+print(projectdir)
+
+# corpus
+chicken_and_gossip_path = projectdir + '/Data/corpus/chicken_and_gossip.txt'
+
+# word2vec
+w2v_model_merge_short_path = projectdir + "/Data/chinese_vector/w2v_model_merge_short.vec"
+
+# tf_idf
+td_idf_cut_path = projectdir + '/Data/tf_idf/td_idf_cut.csv'
+td_idf_cut_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin.csv'
+td_idf_path_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin_dictionary_model.pkl'
+td_idf_path = projectdir + '/Data/tf_idf/td_idf_cut_dictionary_model.pkl'
+
+# word-level sentence vectors
+w2v_model_wiki_word_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_word.vec'
+matrix_ques_part_path = projectdir + '/Data/sentence_vec_encode_word/1.txt'
+
+# char-level sentence vectors
+w2v_model_char_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_char.vec'
+matrix_ques_part_path_char = projectdir + '/Data/sentence_vec_encode_char/1.txt'
+
+# word2vec select
+word2_vec_path = w2v_model_wiki_word_path if os.path.exists(w2v_model_wiki_word_path) else w2v_model_merge_short_path
\ No newline at end of file
diff --git a/python-version-time b/python-version-time
new file mode 100644
index 0000000..54dcdff
--- /dev/null
+++ b/python-version-time
@@ -0,0 +1,15 @@
+Python 3.3.2 (May 15, 2013)
+Python 3.2.5 (May 15, 2013)
+Python 3.1.5 (April 10, 2012)
+Python 3.0.1 (February 13, 2009)
+Python 2.7.5 (May 15, 2013)
+Python 2.6.8 (April 10, 2012)
+Python 2.5.6 (May 26, 2011)
+Python 2.4.6 (December 19, 2008)
+Python 2.3.7 (March 11, 2008)
+Python 2.2.3 (May 30, 2003)
+Python 2.1.3 (April 8, 2002)
+Python 2.0.1 (June 2001)
+Python 1.6.1 (September 2000)
+Python 1.5.2 (April 1999)
+Older releases: Source releases, binaries-1.1, binaries-1.2, binaries-1.3, binaries-1.4, binaries-1.5
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..b6c21a2
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,49 @@
+# nlp_xiaojiang
+
+# Data
+    - chinese_vector
+        - a truncated part of pre-trained word2vec word vectors (download the full vectors yourself for good results)
+    - corpus
+        - xiaohuangji (little yellow chicken) and gossip QA corpus (not cleaned)
+    - sentence_vec_encode_char
+        - 1.txt (sentence vectors of the first 100000 questions, built from char vectors)
+    - sentence_vec_encode_word
+        - 1.txt (sentence vectors of the first 100000 questions, built from word vectors)
+    - tf_idf (tf-idf built from chicken_and_gossip.txt)
+
+# ChatBot
+    - retrieval-based ChatBot
+        - direct retrieval in the ES style (e.g. with fuzzywuzzy); literal matching only
+        - build sentence vectors and retrieve from the QA base, which also matches sentences that use synonyms (see the sketch below)
+    - generative ChatBot (todo)
+        - seq2seq
+        - GAN
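+
+A minimal sketch of the fuzzy-retrieval idea (an illustration only, not the exact code of chatbot_fuzzy.py; the tiny qa_list below is made up):
+
+```python
+from fuzzywuzzy import fuzz, process
+
+# toy QA pairs in the "question\tanswer" format of chicken_and_gossip.txt
+qa_list = ["你谁呀\t我是小姜机器人", "你喜欢谁\t当然是你啦"]
+questions = [qa.split("\t")[0] for qa in qa_list]
+
+def retrieve(query, topn=3):
+    # rank candidate questions with token_set_ratio and return (question, score) pairs
+    return process.extract(query, questions, scorer=fuzz.token_set_ratio, limit=topn)
+
+print(retrieve("你是谁呀"))
+```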
+
+# FeatureProject
+    - normalization_util: data normalization
+        - 0-1 (min-max) normalization
+        - mean normalization
+        - sigmoid normalization
+    - sim feature (classic ML features only; no bert/elmo style sentence-vector similarity here)
+        - distance_text_or_vec: all kinds of text and vector distances
+        - distance_vec_TS_SS: TS_SS distance between word vectors
+        - cut_td_idf: merge the xiaohuangji corpus with the gossip corpus
+        - sentence_sim_feature: similarity/distance features between two texts, e.g. qq (question vs question) or qa (question vs answer); a small sketch of two such features follows
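+
+A hedged example of two of these features (toy vectors and strings; uses numpy and python-Levenshtein as in distance_text_or_vec.py):
+
+```python
+import numpy as np
+import Levenshtein
+
+def cosine_sim(v1, v2):
+    # cosine similarity between two dense vectors (e.g. word2vec sentence vectors)
+    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
+
+v1, v2 = np.array([1.0, 38, 17, 32]), np.array([5.0, 6, 8, 9])
+print(cosine_sim(v1, v2))                          # vector-space feature
+print(Levenshtein.ratio("你到底是谁", "没想到我是谁"))   # edit-distance ratio feature
+```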
+
+# run
+    - 1. build the tf-idf files etc. (step 2 needs step 1 first): python cut_td_idf.py
+    - 2. compute all the similarity features between two sentences, first for a predefined pair and then for your own input (run 1 first): python sentence_sim_feature.py
+    - 3. run chatbot_1 (fuzzy retrieval) (standalone): python chatbot_fuzzy.py
+    - 4. run chatbot_2 (sentence-vector retrieval, word level) (standalone): python chatbot_sentence_vec_by_word.py
+    - 5. run chatbot_3 (sentence-vector retrieval, char level) (standalone): python chatbot_sentence_vec_by_char.py
+
+# requestments.txt
+    - python_Levenshtein
+        - to install Levenshtein (my Python is 3.6):
+        - open https://www.lfd.uci.edu/~gohlke/pythonlibs/
+        - search for python_Levenshtein-0.12.0-cp36-cp36m-win_amd64.whl and download it
+    - pyemd
+    - pyhanlp
+        - first download the dependency JPype1-0.6.3-cp36-cp36m-win_amd64.whl
+
+
diff --git a/requestments.txt b/requestments.txt
new file mode 100644
index 0000000..2195e9c
--- /dev/null
+++ b/requestments.txt
@@ -0,0 +1,12 @@
+python-Levenshtein==0.12.0
+fuzzywuzzy==0.17.0
+openpyxl==2.6.2
+pandas==0.24.2
+xpinyin==0.5.6
+numpy==1.16.1
+gensim==3.7.1
+pyemd==0.5.1
+jieba==0.39
+xlrd==1.2.0
+sklearn
+pathlib
diff --git a/result_test/__init__.py b/result_test/__init__.py
new file mode 100644
index 0000000..cdaeb55
--- /dev/null
+++ b/result_test/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 14:40
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/result_test/result_chatbot_fuzzy.txt b/result_test/result_chatbot_fuzzy.txt
new file mode 100644
index 0000000..cd5b755
--- /dev/null
+++ b/result_test/result_chatbot_fuzzy.txt
@@ -0,0 +1,38 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+read questions ok!
+[the interactive Q&A transcript that followed was saved with the wrong console encoding (GBK mojibake) and is not recoverable]
diff --git a/result_test/result_chatbot_sentence_vec_by_char.txt b/result_test/result_chatbot_sentence_vec_by_char.txt
new file mode 100644
index 0000000..f750d52
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_char.txt
@@ -0,0 +1,55 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+np.loadtxt(matrix_ques_part_path_char) ok!
+[the interactive Q&A transcript that followed was captured with the wrong console encoding (GBK mojibake); only the English log lines are recoverable]
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py:115: RuntimeWarning: invalid value encountered in true_divide
+ matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+'gbk' codec can't encode character '\u301c' in position 227: illegal multibyte sequence
+"word ' ' not in vocabulary"
+list index out of range
diff --git a/result_test/result_chatbot_sentence_vec_by_word.txt b/result_test/result_chatbot_sentence_vec_by_word.txt
new file mode 100644
index 0000000..2c1f251
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_word.txt
@@ -0,0 +1,73 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load_word2vec_model start!
+load_word2vec_model end!
+load w2v_model_wiki_word_path ok!
+np.loadtxt(matrix_ques_part_path) start!
+np.loadtxt(matrix_ques_part_path) end!
+[the interactive Q&A transcript that followed was captured with the wrong console encoding (GBK mojibake); only the English log lines are recoverable]
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.815 seconds.
+Prefix dict has been built succesfully.
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py:131: RuntimeWarning: invalid value encountered in true_divide
+ matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+'gbk' codec can't encode character '\u301c' in position 116: illegal multibyte sequence
diff --git a/result_test/result_sentence_sim_feature.txt b/result_test/result_sentence_sim_feature.txt
new file mode 100644
index 0000000..f5adfec
--- /dev/null
+++ b/result_test/result_sentence_sim_feature.txt
@@ -0,0 +1,37 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load w2v model begin
+load w2v model success
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.719 seconds.
+set_data time0.7200782299041748
+Prefix dict has been built succesfully.
+same_word_count time0.0
+same_char_count time0.0
+sentence_length time0.0
+w2c_all_vec time0.1994335651397705
+tdidf_all_vec time0.0
+edit_all_str time0.0019953250885009766
+jaccard time0.0
+tdidf_all_vec_pinyin time0.0
+edit_all_pinyin time0.004553556442260742
+jaccard pinyin time0.0
+sim:
+[1, 3, 1, 1.1, 5.238095238095238, 0.6782572237857507, 3461.1677906854284, 283.83272299933014, 19.980963040347838, 0.9999999999966667, 3.0830289870500565, 19.980963040347838, 24.494821131252575, 79619.83774188746, -5.10379204991808, 0.6769724044408956, 0.0, 12, 9, 0.2857142857142857, 0.5242424242424243, 19, 19, 2, 8.141546895617283, 0.08333333333333333, 0.16666666666666666, 0.008081558347970244, 17, 22, 0.5217391304347826, 0.6838686096962837, 56, 47, 4, 6.190419904893637, 0.11764705882352941, 0.11764705882352941]
+s1:
+s2: [GBK mojibake]
+set_data time0.0009706020355224609
+same_word_count time0.0009982585906982422
+same_char_count time0.0
+sentence_length time0.0
+w2c_all_vec time0.20846796035766602
+tdidf_all_vec time0.0
+edit_all_str time0.0019943714141845703
+jaccard time0.0
+tdidf_all_vec_pinyin time0.0
+edit_all_pinyin time0.0019960403442382812
+jaccard pinyin time0.0
+sim:
+[2, 3, 1, 0.875, 3.7333333333333334, 0.8200504988005877, 3746.94646712115, 236.48076447923086, 17.65693370974129, 0.9999999999966667, 4.2634280025959015, 17.65693370974129, 24.494877087856107, 78956.49194315828, -13.367107715032754, 0.8200018973656127, 0.07174613344073014, 21, 6, 0.4, 0.6011904761904762, 40, 40, 1, 5.620521171774245, 0.2, 0.25, 0.36243089354552877, 10, 15, 0.5384615384615384, 0.6417797888386123, 62, 58, 5, 6.01776904578638, 0.25, 0.25]
+s1:
\ No newline at end of file
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..d838479
--- /dev/null
+++ b/utils/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 15:15
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/utils/text_tools.py b/utils/text_tools.py
new file mode 100644
index 0000000..a8b40f6
--- /dev/null
+++ b/utils/text_tools.py
@@ -0,0 +1,322 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 11:23
+# @author :Mo
+# @function :utils, tools
+
+
+from openpyxl import Workbook
+import logging as logger
+import gensim
+import jieba
+import time
+import xlrd
+import re
+
+
+# Chinese and English punctuation
+filters='[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '!,;:。?、“”’‘《》()~@#¥%……&*\()/{}【】…=-]'
+# punctuation and whitespace
+filters_1 = "[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》()~@#¥%……&*\(\)\/\-]+"
+
+"""去除标点符号、空格"""
+def clear_punctuation(text):
+ """去除标点符号"""
+ sentence = text.replace(' ', '')
+ sentence_punctuation_clear = re.sub(filters, ' ', sentence).strip()
+ sentence_punctuation_clear_replace = sentence_punctuation_clear.replace(' ', ' ').replace(' ', ' ')
+ return sentence_punctuation_clear_replace
+
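+# Illustrative usage (the example string below is an assumption, not from the original file):
+#   clear_punctuation('你好,世界!Hello, world!')  ->  '你好 世界 Hello world'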
+
+'''keep Chinese characters, letters and digits; strip other special characters'''
+def getChinese1(ques):
+ # ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
+ findAllChinese = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z])", ques))
+ # print(sub_str)
+ return findAllChinese
+
+
+'''read an xls file with xlrd'''
+def xlsRead(sheetName=None, cols=0, fileXlsPath=None):
+    '''read an xls sheet into a list of rows'''
+    workbook = xlrd.open_workbook(fileXlsPath)
+    # fetch the sheet content by sheet name (or index)
+ sheet = workbook.sheet_by_name(sheetName)
+ nrows = sheet.nrows
+ ncols = sheet.ncols
+
+ listRows = []
+ for i in range(nrows):
+ listRows.append(sheet.row_values(i))
+
+ return listRows
+
+
+'''write an xlsx file with openpyxl'''
+def xlsxWrite(sheetName, writeList, fileXlsName):
+    wb = Workbook()
+    print('{}'.format(wb.sheetnames)) # a default sheet named "Sheet" already exists (Office 2016 would create "Sheet1")
+ sheet = wb.create_sheet(sheetName)
+ # i = 0
+ for listLine_one in writeList:
+ # i += 1
+ sheet.append(listLine_one)
+ # if i == 1000:
+ # break
+ wb.save(fileXlsName)
+
+
+
+"""判断一个unicode是否是英文字母"""
+def is_alphabet(uchar):
+ """判断一个unicode是否是英文字母"""
+ if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
+ return True
+ else:
+ return False
+
+'''read a txt file into a list of lines'''
+def txtRead(filePath, encodeType = 'utf-8'):
+ listLine = []
+ try:
+ file = open(filePath, 'r', encoding= encodeType)
+
+ while True:
+ line = file.readline()
+ if not line:
+ break
+
+ listLine.append(line)
+
+ file.close()
+
+ except Exception as e:
+ logger.info(str(e))
+
+ finally:
+ return listLine
+
+'''write a list of lines to a txt file'''
+def txtWrite(listLine, filePath, type = 'w',encodeType='utf-8'):
+
+ try:
+ file = open(filePath, type, encoding=encodeType)
+ file.writelines(listLine)
+ file.close()
+
+ except Exception as e:
+ logger.info(str(e))
+
+'''keep Chinese characters, letters and digits; strip other special characters'''
+'''to preserve the spacing around special characters, match the text character by character'''
+
+def getChinese(ques):
+ # ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
+ ques = strQ2B(ques)
+ answer = ''
+ for ques_one in ques:
+ ques_one_findall = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z峣㒶㒰玘宸諕鄕缓緩𪥵嬆嬲煙草砼赟贇龘㗊㵘㙓敠])", ques_one))
+ if not ques_one_findall:
+ ques_one_findall = ' '
+ answer = answer + ques_one_findall
+ answer = answer.strip().replace(' ', ' ').replace(' ', ' ')
+ return answer.upper()
+
+'''remove punctuation symbols'''
+
+def get_syboml(ques):
+ # ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
+ ques = strQ2B(ques)
+ # answer = re.sub(u'([。.,,、\;;::??!!“”"‘’'''()()…——-《》<>{}_~【】\\[])', ' ', ques).replace(' ', ' ').replace(' ', ' ')
+ answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\](|){}【】~@#¥%…&*\/\-—_]+", " ", ques).strip()
+ return answer
+
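+# Illustrative usage (assumed example strings, not from the original file):
+#   get_syboml('你好!是吗?')    ->  '你好 是吗'       # punctuation replaced by spaces
+#   getChinese('你好!abc 123')  ->  '你好 ABC 123'    # keeps Chinese/letters/digits and upper-cases letters
+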
+def strQ2B(ustring):
+ """全角转半角"""
+ rstring = ""
+ for uchar in ustring:
+ inside_code = ord(uchar)
+ if inside_code == 12288: # 全角空格直接转换
+ inside_code = 32
+ elif (inside_code >= 65281 and inside_code <= 65374): # 全角字符(除空格)根据关系转化
+ inside_code -= 65248
+
+ rstring += chr(inside_code)
+ return rstring
+
+def strB2Q(ustring):
+ """半角转全角"""
+ rstring = ""
+ for uchar in ustring:
+ inside_code = ord(uchar)
+ if inside_code == 32: # 半角空格直接转化
+ inside_code = 12288
+ elif inside_code >= 32 and inside_code <= 126: # 半角字符(除空格)根据关系转化
+ inside_code += 65248
+
+ rstring += chr(inside_code)
+ return rstring
+
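+# Illustrative usage (assumed example strings, not from the original file):
+#   strQ2B('ABC 123!')  ->  'ABC 123!'    # full-width characters become half-width
+#   strB2Q('ABC 123!')  ->  'ABC 123!'    # and back again
+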
+def is_valid_date(strdate):
+    '''check whether a string is a valid date (YYYY-MM-DD, optionally with a time part)'''
+ try:
+ if ":" in strdate:
+ time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
+ else:
+ time.strptime(strdate, "%Y-%m-%d")
+ return True
+ except:
+ return False
+
+'''check whether a text is made up of English letters only'''
+
+def is_total_english(text):
+    """check whether the text consists only of English letters"""
+ symbol = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+ try:
+ sentence_punctuation_clear = get_syboml(text)
+ sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
+ numben = 0
+ for one in sentence_punctuation_clear:
+ if one in symbol:
+ numben += 1
+ if numben == len(sentence_punctuation_clear):
+ return True
+ else:
+ return False
+ except:
+ return False
+
+'''check whether a text is made up of digits only'''
+
+def is_total_number(text):
+    """check whether the text consists only of digits"""
+ try:
+ sentence_punctuation_clear = get_syboml(text)
+ sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
+ numben = 0
+ for one in sentence_punctuation_clear:
+ if one.isdigit():
+ numben += 1
+ if numben == len(sentence_punctuation_clear):
+ return True
+ else:
+ return False
+ except:
+ return False
+
+def is_number_or_english(text):
+    '''check whether every character is a digit or an English letter'''
+ judge = False
+ try:
+ sentence_punctuation_clear = get_syboml(text)
+ sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
+ for words in sentence_punctuation_clear:
+ judge_number = is_total_number(words)
+ judge_english = is_total_english(words)
+ judge = judge_number or judge_english
+ if not judge:
+ return False
+ return judge
+ except:
+ return False
+
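+# Illustrative expectations (assumed examples, not part of the original module):
+#   is_total_english('hello world')  ->  True
+#   is_total_number('2019 04 03')    ->  True
+#   is_number_or_english('abc123')   ->  True
+#   is_number_or_english('你好123')  ->  False
+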
+# todo: sentence rewriting, synonym substitution, stop-word removal, etc.
+
+
+if __name__ == '__main__':
+
+
+ # for i in range(10):
+ # sentence_vec = word2vec_model.wv["的"]
+ # sentence_vec_pd = pd.DataFrame(sentence_vec)
+ # sentence_vec_pd.to_csv('my_csv.csv', mode='a', header=False)
+
+ # sentence_ee = pd.read_csv('my_csv.csv')
+
+ # txtWrite([str(sentence_vec)], "gg.txt")
+
+
+ # path_test_data_government = '/data/test_data_government.csv'
+ # sentences = txtRead(path_test_data_government)
+ sentences = []
+ sentences_one_clear_punctuation_all = []
+ for sentences_one in sentences[1:]:
+ sentences_one_1 = sentences_one
+ sentences_one_clear_punctuation = clear_punctuation(sentences_one_1.replace(',0.0,1.0', ''))
+ # print(sentences_one)
+ # print(sentences_one_clear_punctuation)
+ sentences_one_clear_punctuation_jieba = jieba.cut(sentences_one_clear_punctuation, cut_all=False, HMM=False)
+ sentences_one_clear_punctuation_jieba_list = ' '.join(list(sentences_one_clear_punctuation_jieba)).replace(' ', ' ').replace(' ', ' ').strip()
+ sentences_one_clear_punctuation_all.append(sentences_one_clear_punctuation_jieba_list + ',0.0,1.0' + '\n')
+
+ txtWrite(sentences[0:1] + sentences_one_clear_punctuation_all, '/data/test_data_government_cut.csv')
+
+ #',0.0,1.0'
+ # np.savetxt('001', [word2vec_model.wv["的"], word2vec_model.wv["的"]])
+ # gg = np.loadtxt('001')
\ No newline at end of file
diff --git a/utils/word2vec_vector.py b/utils/word2vec_vector.py
new file mode 100644
index 0000000..50a3e9a
--- /dev/null
+++ b/utils/word2vec_vector.py
@@ -0,0 +1,55 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/4 10:00
+# @author :Mo
+# @function :
+
+from __future__ import print_function
+from utils.text_tools import txtRead, txtWrite
+from gensim.models.word2vec import LineSentence
+from gensim.models import Word2Vec
+import multiprocessing
+import logging
+import sys
+import os
+
+def train_word2vec_by_word():
+ logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
+ logging.root.setLevel(level=logging.INFO)
+ logging.info("running")
+
+ inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt"
+ outp1 = "w2v_model_wiki.model"
+ outp2 = "w2v_model_wiki_word.vec"
+ model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
+ model.save(outp1)
+ model.wv.save_word2vec_format(outp2, binary=False)
+
+def train_word2vec_by_char():
+ logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
+ logging.root.setLevel(level=logging.INFO)
+ logging.info("running")
+
+ inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt"
+ outp1 = "w2v_model_wiki.model"
+ outp2 = "w2v_model_wiki_char.vec"
+ model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
+ model.save(outp1)
+ model.wv.save_word2vec_format(outp2, binary=False)
+
+
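+# Optional sanity check (a hedged sketch, not part of the original training script):
+# load the plain-text vectors written above and print a few nearest neighbours.
+# The query word "中国" is only an illustrative assumption.
+def check_word2vec_vectors(vec_path="w2v_model_wiki_word.vec"):
+    from gensim.models import KeyedVectors
+    # save_word2vec_format(..., binary=False) writes the plain-text word2vec format
+    wv = KeyedVectors.load_word2vec_format(vec_path, binary=False)
+    if "中国" in wv.vocab:
+        print(wv.most_similar("中国", topn=10))
+
+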
+if __name__ == '__main__':
+ train_word2vec_by_word()
+ # train_word2vec_by_char()
+
+ # inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt"
+ # sentences_char = []
+ # sentences = txtRead(inp)
+ # for sentences_one in sentences:
+ # sentences_one_replace = sentences_one.strip().replace(" ", "")
+ # sentences_one_replace_all = []
+ # for sentences_one_replace_one in sentences_one_replace:
+ # sentences_one_replace_all.append(sentences_one_replace_one)
+ # sentences_char.append(" ".join(sentences_one_replace_all) + "\n")
+ # txtWrite(sentences_char, "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt")
+ # gg = 0
\ No newline at end of file