Add files via upload

This commit is contained in:
yongzhuo 2019-04-09 15:26:07 +08:00 committed by GitHub
parent 0ec8964b47
commit d27dac119e
29 changed files with 2245 additions and 0 deletions

5
ChatBot/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @Time :2019/3/29 23:11
# @author :Mo
# @function :

View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 15:15
# @author :Mo
# @function :

View File

@ -0,0 +1,163 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :
from conf.path_config import chicken_and_gossip_path
from utils.text_tools import txtRead, txtWrite
from conf.path_config import projectdir
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
import pickle
import time
import re
def count_same_char(x1, x2):
'''count the characters of x1 that also appear in x2'''
res = []
for x in x1:
if x in x2:
res.append(x)
if res:
return len(res)
else:
return 0
def fuzzy_re(user_input, collection):
'''regex-based matching; not very effective: it only matches candidates containing the same characters (or more), and it cannot tell apart synonyms or a single differing word'''
suggestions = []
user_input = user_input.replace('.', '').replace('*', '').replace('?', '')
collection_new = []
len_user_input = len(user_input)
for coll in collection: # keep only candidates that contain every character of the input; the rest are dropped
count_coll = 0
for i in range(len_user_input):
if user_input[i] in coll:
count_coll += 1
if len_user_input == count_coll:
collection_new.append(coll)
if not collection_new:
return None
pattern = '.*?'.join(user_input) # Converts 'djm' to 'd.*?j.*?m'
try:
regex = re.compile(pattern) # Compiles a regex.
except re.error:
return None # an invalid pattern means nothing can match
for item in collection_new:
match = regex.search(item) # Checks if the current item matches the regex.
if match:
suggestions.append((len(match.group()), match.start(), item))
return [x for _, _, x in sorted(suggestions)]
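# Illustrative sketch (not part of the original flow, made-up data): the input '你好吗'
# becomes the pattern '你.*?好.*?吗', so only candidates containing those three characters
# in that order can match.
def _demo_fuzzy_re():
    demo_collection = ['你好吗', '你今天好吗', '你吗好']  # hypothetical candidate questions
    return fuzzy_re('你好吗', demo_collection)  # expected: ['你好吗', '你今天好吗']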
def fuzzy_fuzzywuzzy(fuzz, user_input, collection):
'''edit-distance based matching; slower, but unlike the regex method it can handle questions whose characters differ'''
collection_new = []
len_user_input = len(user_input)
for coll in collection: # keep candidates that share at least one character with the input
for i in range(len_user_input):
if user_input[i] in coll:
collection_new.append(coll)
if not collection_new:
return None
collection_new = list(set(collection_new))
same_char_list = []
for collection_new_one in collection_new: # rank candidate questions by the number of shared characters
count_same_char_one = count_same_char(user_input, collection_new_one)
same_char_list.append((collection_new_one, count_same_char_one))
same_char_list.sort(key=lambda x: x[1], reverse=True)
if len(same_char_list) >= 500:
same_char_list = same_char_list[0: 500]
result = process.extract(user_input, [x[0] for x in same_char_list], scorer=fuzz.token_set_ratio, limit=20) # score the candidate strings only, not the (string, count) tuples
return result
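# Minimal usage sketch (hypothetical data): process.extract returns (candidate, score)
# pairs, best token_set_ratio first, which is what the callers below unpack.
def _demo_process_extract():
    candidates = ['你是谁', '你谁呀', '今天天气怎么样']  # made-up candidate questions
    return process.extract('你谁呀', candidates, scorer=fuzz.token_set_ratio, limit=2)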
def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50):
'''edit-distance based matching over the whole QA list; slower, but handles differing characters and returns the topn most similar questions'''
start_time = time.time()
# user_input_set = set([user_input_one for user_input_one in user_input])
user_input_set = [user_input_one for user_input_one in user_input]
same_char_list = []
max_data = 0
max_data_list = []
count_collection_new_one = 0
for collection_new_one in collection: # count shared characters for every candidate question
count_same_char_one = len([x for x in user_input_set if x in collection_new_one])
if count_same_char_one > 0:
same_char_list.append((count_collection_new_one, count_same_char_one))
if count_same_char_one > max_data:
max_data_list.append(count_same_char_one)
max_data = count_same_char_one
count_collection_new_one += 1
end_time1 = time.time()
list_max_count = []
len_max_data_list = len(max_data_list)
for x in range(len_max_data_list): # walk the share-counts from highest to lowest
for k,l in same_char_list:
if l == max_data_list[len_max_data_list -1 - x]:
list_max_count.append(qa_list[k]) # pull the QA pair out of qa_list here
if len(list_max_count) >= 5000:
list_max_count = list_max_count[0:5000]
break
end_time2 = time.time()
# end_time1: 0.34090662002563477
# end_time2: 0.4080846309661865
# end_time1: 0.06417036056518555
# end_time2: 0.08422374725341797
# same_char_list.sort(key=lambda x: x[1], reverse=True)
# if len(same_char_list) >= 20:
# same_char_list = same_char_list[0: 20]
result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn)
end_time3 = time.time()
# print('end_time1: ' + str(end_time1 - start_time))
# print('end_time2: ' + str(end_time2 - start_time))
# print('end_time3: ' + str(end_time3 - start_time))
return result
# [fuzz.WRatio, fuzz.QRatio,
# fuzz.token_set_ratio, fuzz.token_sort_ratio,
# fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio,
# fuzz.UWRatio, fuzz.UQRatio]
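# A small, hedged comparison of the scorers listed above on a classic fuzzywuzzy example:
# token_sort_ratio and token_set_ratio ignore token order, plain ratio does not, which is
# why the retrieval above uses fuzz.token_set_ratio.
def _demo_fuzz_scorers(s1="new york mets", s2="mets new york"):
    return {"ratio": fuzz.ratio(s1, s2),                        # < 100, order matters
            "token_sort_ratio": fuzz.token_sort_ratio(s1, s2),  # 100, tokens sorted first
            "token_set_ratio": fuzz.token_set_ratio(s1, s2)}    # 100, compared as token sets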
if __name__ == '__main__':
start_time = time.time()
qa_list = txtRead(chicken_and_gossip_path)
questions = [qa.strip().split("\t")[0] for qa in qa_list]
print("read questions ok!")
sen = "你谁呀"
# list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list)
# list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list)
print("你问: " + "你谁呀")
list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5)
print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
print("推荐结果: ")
print(list_fuzzyfinder)
while True:
print("你问: ")
ques = input()
list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, ques, qa_list, questions, topn=5)
print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip())
print("推荐结果: ")
print(list_fuzzyfinder)

View File

@ -0,0 +1,142 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :chatbot based search, encode sentence_vec by char
from conf.path_config import w2v_model_char_path
from conf.path_config import matrix_ques_part_path_char
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import projectdir, chicken_and_gossip_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import gensim
import jieba
def load_word2vec_model(path, bin=False, limit=None):
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore')
return word2vec_model
def encoding_question(w2v_model, char_list):
''' Build a sentence vector by summing character vectors.
:param w2v_model: gensim KeyedVectors model
:param char_list: list, the characters of the question
:return: numpy array, sentence vector (len = embedding size, e.g. 300)
'''
try:
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
except:
sentence_vec = w2v_model.wv[w2v_model.index2word[0]] * 0
for k in range(len(char_list)):
char_list_one = char_list[k]
if type(char_list_one) == str:
try:
sentence_vec = sentence_vec + w2v_model.wv[char_list_one]
except Exception as e:
print(str(e))
if char_list_one not in [' ', '']:
sentence_vec = sentence_vec + 1
return sentence_vec
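# Sketch of the idea above with plain numpy and a toy, hypothetical "vocabulary": the sentence
# vector is just the sum of its character vectors, and characters missing from the vocab fall
# back to adding a constant, as in encoding_question.
def _demo_char_sentence_vec():
    import numpy as np
    char_vecs = {'你': np.array([1.0, 0.0]), '好': np.array([0.0, 1.0])}  # toy 2-d embeddings
    vec = np.zeros(2)
    for ch in '你好吗':
        vec = vec + char_vecs[ch] if ch in char_vecs else vec + 1  # '吗' is OOV here -> +1
    return vec  # array([2., 2.])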
def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
"""
Find the most similar sentences: dot the query sentence vector with the question matrix.
:param vec_ques: numpy array, query sentence vector
:param matrix_org: numpy matrix of question sentence vectors
:param top_vec: int, number of results to return
:return: list of [index, score]
"""
# index the rows of the sentence-vector matrix
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# normalize every row of matrix_org to unit length
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# similarity of the query against every question: a single matrix-vector dot product
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# sort by similarity
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
return index_score
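# Self-contained sketch of the ranking above: L2-normalise the question matrix and the query,
# then one matrix-vector dot product yields the cosine similarity of the query to every question.
def _demo_cosine_ranking():
    import numpy as np
    matrix = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # 3 hypothetical question vectors
    query = np.array([1.0, 1.0])                             # hypothetical query vector
    matrix_norm = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)
    scores = matrix_norm.dot(query / np.linalg.norm(query))  # cosine similarity per question
    return np.argsort(-scores)                               # [2, 0, 1]: row 2 is the best match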
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path):
"""
Build sentence vectors for every question and cache them to disk as text files.
:param sen_count: int
:param word2vec_model: gensim model
:param qa_path: str
:param matrix_ques_path:str
:return: None
"""
if os.path.exists(matrix_ques_path):
file_matrix_ques = open(matrix_ques_path, 'rb')
matrix_ques = pickle.load(file_matrix_ques)
return matrix_ques
print('create_matrix_org_pkl start!')
qa_dail = txtRead(qa_path, encodeType='utf-8')
# questions = []
matrix_ques = []
count = 0
for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0])
char_list = [ques_char for ques_char in ques]
sentence_vec = encoding_question(word2vec_model, char_list)
matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count))
count += 1
np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
matrix_ques = []
break
# count += 1
# np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques)
print('create_matrix_org_pkl ok!')
# return matrix_ques
if __name__ == '__main__':
# load the QA corpus
syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')
# load the char word2vec vectors
word2vec_model = load_word2vec_model(w2v_model_char_path, limit=None)
# build sentence vectors for the standard questions and save them to matrix_ques_path (100000 by default, configurable; this takes some time)
if not os.path.exists(matrix_ques_part_path_char):
# matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char)
# reload
matrix_ques = np.loadtxt(matrix_ques_part_path_char)
print("np.loadtxt(matrix_ques_part_path_char) ok!")
while True:
print("你问: ")
ques_ask = input()
ques_clean = getChinese(ques_ask)
char_list = [ques_char for ques_char in ques_clean]
sentence_vic = encoding_question(word2vec_model, char_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
except Exception as e:
# some characters may not be printable in the console encoding
print(str(e))

View File

@ -0,0 +1,217 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :chatbot based search, encode sentence_vec by word
from conf.path_config import w2v_model_merge_short_path, w2v_model_wiki_word_path
from conf.path_config import projectdir, chicken_and_gossip_path
from utils.text_tools import txtRead, txtWrite, getChinese
from conf.path_config import matrix_ques_part_path
from numpy import float32 as numpy_type
from collections import Counter
import pickle, jieba, os, re
import jieba.posseg as pseg
from gensim import matutils
from math import log
import numpy as np
import gensim
import jieba
import time
def load_word2vec_model(path, bin=False, limit=None):
print("load_word2vec_model start!")
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore')
print("load_word2vec_model end!")
return word2vec_model
def is_oov(model_vec, query_seg, p_max=0.16):
"""
Check the OOV rate of the segmented query: if more than p_max of the tokens are out of vocabulary, the answer is discarded.
:param model_vec: word vector model; :param query_seg: list of query tokens; :param p_max: float, max tolerated OOV ratio
:return: bool, True if the query should be discarded
"""
words = [word for word in query_seg if str(word).strip() != ""]
count_total = 1
count_oov = 0
if words:
count_total = len(words)
for word in words:
if word not in model_vec:
count_oov = count_oov + 1
return float(count_oov/count_total) > p_max
def get_td_idf_flag(jieba_cut_list, dictionary, tfidf_model):
# todo
'''get tf-idf weights; known issues: each word is only counted once, and some words (e.g. stopwords) get no weight'''
seg1_list = []
vec1 = tfidf_model[dictionary.doc2bow(jieba_cut_list)]
for vec1_one in vec1:
seg1_list.append(vec1_one[1])
sum_seg1_list = sum(seg1_list)
return [x/sum_seg1_list for x in seg1_list]
def get_jieba_flag(flag):
'''POS-based weight for a word'''
if flag in ['n', 'nr', 'ns', 'nt', 'nz']:
weight = 1.3
elif flag in ['r', 'i', 't', 'ng', 'an']:
weight = 0.7
else:
weight = 1
return weight
def word_segment_process(sentence):
"""
Segment a sentence with jieba and tag each word's part of speech.
:param sentence:
:return:
"""
sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip()
word_list = []
flag_list = []
try:
sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False))
words = pseg.cut(sentence_cut)
for word in words:
word_list.append(word.word)
flag_list.append(word.flag)
except Exception as e:
word_list = [sentence]
flag_list = ['nt']
return word_list, flag_list
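# Usage sketch of jieba.posseg, which word_segment_process wraps: pseg.cut yields (word, flag)
# pairs; nouns ('n', 'ns', ...) later get a higher weight in get_jieba_flag. The output shown is
# only indicative, it depends on jieba's dictionary.
def _demo_pseg(sentence="我爱北京"):
    import jieba.posseg as pseg_demo
    pairs = [(p.word, p.flag) for p in pseg_demo.cut(sentence)]
    return pairs  # e.g. [('我', 'r'), ('爱', 'v'), ('北京', 'ns')]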
def encoding_question(w2v_model, word_list, flag_list):
''' Build a sentence vector from word vectors, weighting each word by its POS flag.
:param w2v_model: gensim KeyedVectors model
:param word_list: list of segmented words
:param flag_list: list of POS flags aligned with word_list
:return: numpy array, sentence vector (len = embedding size, e.g. 300)
'''
try:
sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0
except:
sentence_vec = w2v_model.wv[w2v_model.index2word[0]] * 0
for k in range(len(word_list)):
word = word_list[k]
flag = flag_list[k]
if type(word) == str:
try:
sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag)
except Exception as e:
if word not in [' ', '']:
sentence_vec = sentence_vec + 1
return sentence_vec
def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20):
"""
Find the most similar sentences: dot the query sentence vector with the question matrix.
:param vec_ques: numpy array, query sentence vector
:param matrix_org: numpy matrix of question sentence vectors
:param top_vec: int, number of results to return
:return: list of [index, score]
"""
# index the rows of the sentence-vector matrix
matrix_org_index = list(range(len(matrix_org)))
# Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged.
vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type)
# normalize every row of matrix_org to unit length
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
# similarity of the query against every question: a single matrix-vector dot product
matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
# keep top_vec within bounds
top_vec = min(len(matrix_org), top_vec)
# sort by similarity
most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True)
index_score = []
for t in most_similar_sentence_vec_sort[:top_vec]:
index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
return index_score
def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word):
"""
Build sentence vectors for every question; set sen_count (e.g. 10000) to limit how many are held in memory at once and avoid crashes.
:param sen_count: int, write sentence_encode num per twice
:param word2vec_model: model
:param qa_path: str
:param matrix_ques_path: str
:return:
"""
if os.path.exists(matrix_ques_path_word):
file_matrix_ques = open(matrix_ques_path_word, 'rb')
matrix_ques = pickle.load(file_matrix_ques)
return matrix_ques
print('create_matrix_org_pkl start!')
qa_dail = txtRead(qa_path, encodeType='utf-8')
# questions = []
matrix_ques = []
count = 0
for qa_dail_one in qa_dail:
ques = getChinese(qa_dail_one.split('\t')[0])
# questions.append(ques)
word_list, flag_list = word_segment_process(ques)
sentence_vec = encoding_question(word2vec_model, word_list, flag_list)
matrix_ques.append(sentence_vec)
if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0:
print("count: " + str(count))
count += 1
np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques)
matrix_ques = []
# break
count += 1
np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques)
# matrix_ques = []
# file_matrix_ques = open(matrix_ques_path, 'wb')
# pickle.dump(matrix_ques, file_matrix_ques)
print('create_matrix_org_np ok!')
# return matrix_ques
if __name__ == '__main__':
# load the QA corpus
syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8')
# load word vectors: w2v_model_wiki_word_path was trained by ourselves; w2v_model_merge_short_path only keeps part of the data, you can download the full version
if os.path.exists(w2v_model_wiki_word_path):
word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None)
print("load w2v_model_wiki_word_path ok!")
else:
word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None)
print("load w2v_model_merge_short_path ok!")
# build sentence vectors for the standard questions and save them to matrix_ques_path
if not os.path.exists(matrix_ques_part_path):
create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path)
# load
print("np.loadtxt(matrix_ques_part_path) start!")
matrix_ques = np.loadtxt(matrix_ques_part_path)
print("np.loadtxt(matrix_ques_part_path) end!")
while True:
print("你: ")
ques_ask = input()
ques_clean = getChinese(ques_ask)
word_list, flag_list = word_segment_process(ques_clean)
sentence_vic = encoding_question(word2vec_model, word_list, flag_list)
top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20)
try:
print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1])
print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))])
except Exception as e:
# some characters may not be printable in the console encoding
print(str(e))

View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @Time :2019/3/29 23:10
# @author :Mo
# @function :

Binary file not shown.

View File

@ -0,0 +1,104 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/1 10:35
# @author :Mo
# @function :cut sentences
from conf.path_config import chicken_and_gossip_path, td_idf_cut_path, td_idf_cut_pinyin
from utils.text_tools import txtWrite, txtRead, get_syboml, strQ2B
from conf.path_config import projectdir
from gensim import corpora, models
import xpinyin
import pickle
import jieba
def cut_td_idf(sources_path, target_path):
"""
Segment the Chinese corpus with jieba and write the space-joined result.
:param sources_path: str, input path; :param target_path: str, output path
:return:
"""
print("cut_td_idf start! ")
corpus = txtRead(sources_path)
governments = []
for corpus_one in corpus:
corpus_one_clear = corpus_one.replace(' ', '').strip()
ques_q2b = strQ2B(corpus_one_clear.strip())
ques_q2b_syboml = get_syboml(ques_q2b)
governments.append(ques_q2b_syboml.strip())
government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments))
topic_ques_all = []
for topic_ques_one in government_ques:
top_ques_aqlq = topic_ques_one.replace(' ', ' ').replace(' ', ' ').strip() + '\n'
topic_ques_all.append(top_ques_aqlq)
txtWrite(topic_ques_all, target_path)
print("cut_td_idf ok! " + sources_path)
def cut_td_idf_pinyin(sources_path, target_path): # convert to pinyin
"""
Convert the Chinese corpus to pinyin and write the result.
:param sources_path: str, input path; :param target_path: str, output path
:return:
"""
pin = xpinyin.Pinyin()
corpus = txtRead(sources_path)
topic_ques_all = []
corpus_count = 0
for corpus_one in corpus:
corpus_count += 1
# time1 = time.time()
corpus_one_clear = corpus_one.replace(' ', '').strip()
ques_q2b = strQ2B(corpus_one_clear.strip())
ques_q2b_syboml = get_syboml(ques_q2b)
ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').replace(' ', '').strip(), ' ')
topic_ques_all.append(ques_q2b_syboml_pinying + '\n')
# time2 = time.time()
# print(str(corpus_count) + 'time:' + str(time2 - time1))
txtWrite(topic_ques_all, target_path)
print("cut_td_idf_pinyin ok! " + sources_path)
def init_tfidf_chinese_or_pinyin(sources_path):
"""
Build the gensim dictionary and tf-idf model and pickle them next to the corpus.
:param sources_path: str, input corpus path (Chinese or pinyin)
:return:
"""
questions = txtRead(sources_path)
corpora_documents = []
for item_text in questions:
item_seg = list(jieba.cut(str(item_text).strip()))
corpora_documents.append(item_seg)
dictionary = corpora.Dictionary(corpora_documents)
corpus = [dictionary.doc2bow(text) for text in corpora_documents]
tfidf_model = models.TfidfModel(corpus)
print("init_tfidf_chinese_or_pinyin ok! " + sources_path)
file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb')
pickle.dump([dictionary, tfidf_model], file)
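# Minimal sketch (toy corpus, hypothetical tokens) of how the pickled (dictionary, tfidf_model)
# pair built above is consumed later (see sentence_sim_feature.init_tfidf): doc2bow maps tokens
# to (id, count) pairs and TfidfModel re-weights them.
def _demo_tfidf():
    docs = [['你', '是', '谁'], ['你', '好']]
    demo_dict = corpora.Dictionary(docs)
    demo_tfidf = models.TfidfModel([demo_dict.doc2bow(d) for d in docs])
    # tokens that occur in every document get zero idf and are dropped from the result
    return demo_tfidf[demo_dict.doc2bow(['你', '是'])]  # [(id_of_'是', 1.0)]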
if __name__ == '__main__':
# path_text = projectdir + '/Data/chicken_gossip.txt'
# sentences = txtRead(path_text)
# sentences_q = []
# for sentences_one in sentences:
# sentences_one_replace = sentences_one.replace(" ", "").replace("\t", "")
# sentences_one_replace_split = sentences_one_replace.split("|")
# sentence_new = sentences_one_replace_split[0] + "\t" + "".join(sentences_one_replace_split[1:])
# sentences_q.append(sentence_new)
# sentences = txtWrite(sentences_q, projectdir + '/Data/chicken_and_gossip.txt')
cut_td_idf(chicken_and_gossip_path, td_idf_cut_path)
cut_td_idf_pinyin(chicken_and_gossip_path, td_idf_cut_pinyin)
init_tfidf_chinese_or_pinyin(td_idf_cut_path)
init_tfidf_chinese_or_pinyin(td_idf_cut_pinyin)
print("corpus ok!")

View File

@ -0,0 +1,330 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :
from sklearn.feature_extraction.text import TfidfVectorizer
from utils.text_tools import txtRead, get_syboml, strQ2B
import Levenshtein as Leven
from fuzzywuzzy import fuzz
import jieba.analyse
import numpy as np
import xpinyin
import pickle
import jieba
import os
zero_bit = 0.000000001
pin = xpinyin.Pinyin()
def clear_sentence(sentence):
"""
Clean the text and convert full-width characters to half-width.
:param sentence: str, input sentence
:return: str, cleaned sentence
"""
corpus_one_clear = str(sentence).replace(' ', '').strip()
ques_q2b = strQ2B(corpus_one_clear.strip())
ques_q2b_syboml = get_syboml(ques_q2b)
return ques_q2b_syboml
def chinese2pinyin(sentence):
"""
translate Chinese to pinyin
:param sentence: str, input sentence
:return: str, output pingyin
"""
ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ')
return ques_q2b_syboml_pinying
def hamming_distance(v1, v2): # on binary strings; note: redefined further below for sentences via simhash
n = int(v1, 2) ^ int(v2, 2)
return bin(n & 0xffffffff).count('1')
def cosine_distance(v1, v2): # cosine similarity
# guard against zero vectors, which would make the denominator zero
if np.linalg.norm(v1) > 0 and np.linalg.norm(v2) > 0:
return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
else:
return 0
def euclidean_distance(v1, v2): # Euclidean distance
return np.sqrt(np.sum(np.square(v1 - v2)))
def manhattan_distance(v1, v2): # Manhattan distance
return np.sum(np.abs(v1 - v2))
def chebyshev_distance(v1, v2): # Chebyshev distance
return np.max(np.abs(v1 - v2))
def minkowski_distance(v1, v2): # Minkowski distance (p fixed to 2, so identical to Euclidean here)
return np.sqrt(np.sum(np.square(v1 - v2)))
def euclidean_distance_standardized(v1, v2): # standardized Euclidean distance
v1_v2 = np.vstack([v1, v2])
sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1)
return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum())
def mahalanobis_distance(v1, v2): # Mahalanobis distance
# Mahalanobis distance needs more samples than dimensions, otherwise the covariance matrix cannot be inverted
# stack and transpose here so that each column of X is treated as one sample
X = np.vstack([v1, v2])
XT = X.T
# method 1: solve directly from the formula
S = np.cov(X) # covariance matrix between the two dimensions
try:
SI = np.linalg.inv(S) # inverse of the covariance matrix, todo
except:
SI = np.zeros_like(S)
# compute the pairwise Mahalanobis distances between the samples and sum them
n = XT.shape[0]
distance_all = []
for i in range(0, n):
for j in range(i + 1, n):
delta = XT[i] - XT[j]
distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
distance_all.append(distance_1)
return np.sum(np.abs(distance_all))
def bray_curtis_distance(v1, v2): # Bray-Curtis distance, an ecological distance from biology
up_v1_v2 = np.sum(np.abs(v2 - v1))
down_v1_v2 = np.sum(v1) + np.sum(v2)
return up_v1_v2 / (down_v1_v2 + zero_bit)
def pearson_correlation_distance(v1, v2): # Pearson correlation coefficient
v1_v2 = np.vstack([v1, v2])
return np.corrcoef(v1_v2)[0][1]
def jaccard_similarity_coefficient_distance(v1, v2): # Jaccard similarity coefficient
# method 1: solve directly from the formula
v1 = np.asarray(v1)
v2 = np.asarray(v2)
up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum())
down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit)
return up / down
def wmd_distance(model, sent1_cut_list, sent2_cut_list): # Word Mover's Distance (WMD)
# model.init_sims(replace=True)
distance = model.wmdistance(sent1_cut_list, sent2_cut_list)
return distance
# def HamMings_Levenshtein(str1, str2):
# sim = Leven.hamming(str1, str2)
# return sim
def edit_levenshtein(str1, str2):
return Leven.distance(str1, str2)
def ratio_levenshtein(str1, str2):
return Leven.ratio(str1, str2)
def jaro_levenshtein(str1, str2):
return Leven.jaro(str1, str2)
def set_ratio_fuzzywuzzy(str1, str2):
return fuzz.token_set_ratio(str1, str2)
def sort_ratio_fuzzywuzzy(str1, str2):
return fuzz.token_sort_ratio(str1, str2)
def num_of_common_sub_str(str1, str2):
'''
Length of the longest common substring of two strings.
Idea: a 2-d table records, position by position, whether the characters match consecutively.
'''
lstr1 = len(str1)
lstr2 = len(str2)
record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)] # one extra row and column
maxNum = 0 # length of the longest match
p = 0 # end position of the longest match in str1
for i in range(lstr1):
for j in range(lstr2):
if str1[i] == str2[j]:
# extend the run when the characters are equal
record[i + 1][j + 1] = record[i][j] + 1
if record[i + 1][j + 1] > maxNum:
# keep the longest match length
maxNum = record[i + 1][j + 1]
# remember where the longest match ends
p = i + 1
# return str1[p - maxNum:p], maxNum
return maxNum
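# Quick illustration of the DP above on made-up strings: the longest common substring of
# 'abcde' and 'xcdey' is 'cde', so num_of_common_sub_str('abcde', 'xcdey') returns 3.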
####################################################### hamming distance (simhash)
def string_hash(source):
if source == "":
return 0
else:
x = ord(source[0]) << 7
m = 1000003
mask = 2 ** 128 - 1
for c in source:
x = ((x * m) ^ ord(c)) & mask
x ^= len(source)
if x == -1:
x = -2
x = bin(x).replace('0b', '').zfill(64)[-64:]
return str(x)
def sim_hash(content):
seg = jieba.cut(content)
keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=())
# sort by weight first, then by word
keyList = []
# print(keyWord)
for feature, weight in keyWord:
weight = int(weight * 20)
feature = string_hash(feature)
temp = []
for f in feature:
if f == '1':
temp.append(weight)
else:
temp.append(-weight)
keyList.append(temp)
content_list = np.sum(np.array(keyList), axis=0)
# nothing could be extracted (e.g. unreadable encoding)
if len(keyList) == 0:
return '00'
simhash = ''
for c in content_list:
if c > 0:
simhash = simhash + '1'
else:
simhash = simhash + '0'
return simhash
def hamming_distance_equal(v1, v2):
n = int(v1, 2) ^ int(v2, 2)
return bin(n & 0xffffffff).count('1')
def hamming_distance(sen1, sen2):
return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2))
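# Sketch of the simhash pipeline above on hypothetical sentences: each sentence is reduced to
# a fingerprint by sim_hash and hamming_distance counts the differing bits (0-32 after the
# mask above); smaller means more similar. Scores depend on jieba's keyword extraction.
def _demo_simhash_distance():
    return hamming_distance("你喜欢谁", "你喜欢哪个人")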
def normalization(x):
"""
min-max normalization to [0, 1]
:param x:
:return:
"""
return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x]
def z_score(x, axis=0):
"""
z-score standardization (values centered around 0 with unit variance along the given axis)
:param x: array, numpy
:param axis: int, 0
:return: array, numpy
"""
x = np.array(x).astype(float)
xr = np.rollaxis(x, axis=axis)
xr -= np.mean(x, axis=axis)
xr /= np.std(x, axis=axis)
# print(x)
return x
def tok_td_idf(data_path):
if os.path.exists(data_path + 'td_idf_cut.csv'):
'''compute TF-IDF and build the train/test data'''
datas = txtRead(data_path + 'td_idf_cut.csv')
# by default only tokens of length >= 2 are matched, changed to 1; ngram_range also adds 2-word features, about 50428 terms in total
# vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000)
vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
vec_tdidf.fit_transform(datas)
file_vec_tdidf = open(data_path + 'td_idf_cut_model.pkl', 'wb')
pickle.dump(vec_tdidf, file_vec_tdidf)
return vec_tdidf
def tok_td_idf_pinyin(data_path):
if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'):
'''compute TF-IDF and build the train/test data'''
datas = txtRead(data_path + 'td_idf_cut_pinyin.csv')
# by default only tokens of length >= 2 are matched, changed to 1; ngram_range also adds 2-word features, about 50428 terms in total
# vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000)
vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3,
max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000)
vec_tdidf.fit_transform(datas)
file_vec_tdidf = open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb')
pickle.dump(vec_tdidf, file_vec_tdidf)
return vec_tdidf
if __name__ == '__main__':
vec1_test = np.array([1, 38, 17, 32])
vec2_test = np.array([5, 6, 8, 9])
str1_test = "你到底是谁?"
str2_test = "没想到我是谁,是真样子"
print(clear_sentence(str1_test)) # text cleaning
print(chinese2pinyin(str1_test)) # Chinese to pinyin
print(euclidean_distance(vec1_test, vec2_test))
print(cosine_distance(vec1_test, vec2_test))
print(manhattan_distance(vec1_test, vec2_test))
print(euclidean_distance(vec1_test, vec2_test))
print(chebyshev_distance(vec1_test, vec2_test))
print(minkowski_distance(vec1_test, vec2_test))
print(euclidean_distance_standardized(vec1_test, vec2_test))
print(mahalanobis_distance(vec1_test, vec2_test))
print('###############################################')
print(bray_curtis_distance(vec1_test, vec2_test))
print(pearson_correlation_distance(vec1_test, vec2_test))
print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test))
print('###############################################')
# print(HamMings_Levenshtein(str1, str2)) # requires strings of equal length
# print(Wmd_distance(model, sent1_cut_list, sent2_cut_list)) # requires a gensim word2vec model
print(hamming_distance(str1_test, str2_test))
print(edit_levenshtein(str1_test, str2_test))
print(ratio_levenshtein(str1_test, str2_test))
print(jaro_levenshtein(str1_test, str2_test))
print(set_ratio_fuzzywuzzy(str1_test, str2_test))
print(sort_ratio_fuzzywuzzy(str1_test, str2_test))
print(num_of_common_sub_str(str1_test, str2_test))
print(normalization(vec1_test)) # min-max normalization (0-1)
print(z_score(vec1_test)) # z-score standardization (centered around 0, positive and negative)
# data_path = 'D:/workspace/python/bitbucket/nlp_model_v1.0/nlp_model/models/word_feature/sim_data/'
# tok_TD_IDF(data_path)
# tok_TD_IDF_pinyin(data_path)

View File

@ -0,0 +1,84 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 10:36
# @author :Mo
# @function :TS-SS distance
# @url :https://github.com/taki0112/Vector_Similarity
# @paper :A Hybrid Geometric Approach for Measuring Similarity Level Among Documents and Document Clustering
import numpy as np
import math
zero_bit = 0.000000001
def Cosine(vec1, vec2):
"""
cosine similarity
:param vec1: arrary
:param vec2: arrary
:return: float
"""
result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2) + zero_bit)
return result
def VectorSize(vec):
vec_pow = sum(math.pow(v + zero_bit, 2) for v in vec)
if vec_pow >= 0:
return math.sqrt(vec_pow)
else:
return zero_bit
def InnerProduct(vec1, vec2):
try:
return sum(v1 * v2 for v1, v2 in zip(vec1, vec2))
except:
return zero_bit
def Euclidean(vec1, vec2):
vec12_pow = sum(math.pow((v1 - v2), 2) for v1, v2 in zip(vec1, vec2))
if vec12_pow >= 0:
return math.sqrt(vec12_pow)
else:
return zero_bit
def Theta(vec1, vec2):
cosine_vec12 = Cosine(vec1, vec2)
if -1 <= cosine_vec12 and cosine_vec12 <= 1:
return math.acos(cosine_vec12) + 10
else:
return zero_bit + 10
def Triangle(vec1, vec2):
theta = math.radians(Theta(vec1, vec2))
return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2
def Magnitude_Difference(vec1, vec2):
return abs(VectorSize(vec1) - VectorSize(vec2))
def Sector(vec1, vec2):
ED = Euclidean(vec1, vec2)
MD = Magnitude_Difference(vec1, vec2)
theta = Theta(vec1, vec2)
return math.pi * math.pow((ED + MD), 2) * theta / 360
def TS_SS(vec1, vec2):
return Triangle(vec1, vec2) * Sector(vec1, vec2)
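# For reference, the quantities combined above (per the linked TS-SS paper / reference code):
#   theta    = arccos(cosine(v1, v2)) + 10
#   Triangle = |v1| * |v2| * sin(theta) / 2
#   Sector   = pi * (Euclidean(v1, v2) + ||v1| - |v2||)^2 * theta / 360
# so TS_SS grows with both the angular and the magnitude difference of the vectors, and is 0
# when the two vectors coincide.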
if __name__ == '__main__':
vec1_test = np.array([1, 38, 17, 32])
vec2_test = np.array([5, 6, 8, 9])
print(Euclidean(vec1_test, vec2_test))
print(Cosine(vec1_test, vec2_test))
print(TS_SS(vec1_test, vec2_test))

View File

@ -0,0 +1,96 @@
# -*- coding: UTF-8 -*-
#!/usr/bin/python
# @Time :2019/3/12 14:18
# @author :Mo
# @site :https://blog.csdn.net/rensihui
from sklearn import preprocessing
import numpy as np
def autoL1L2(data, norms = 'l1'):
'''L1 or L2 normalization'''
return preprocessing.normalize(data, norm = norms)
def autoScale(data):
'''standardization, (X - mean) / std: every feature/column ends up centered around 0 with variance 1'''
return preprocessing.scale(data)
def autoMinMaxScaler(data):
'''scale each feature to a given range'''
return preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(data)
def autoLinNorm(data): # takes a matrix
''' 0-1 (min-max) normalization
:param data: numpy matrix
:return: numpy matrix
'''
mins = data.min(0) # minimum of every column of data
maxs = data.max(0) # maximum of every column of data
ranges = maxs - mins # per-column range = max - min
normData = np.zeros(np.shape(data)) # matrix of the same shape as data, to hold the normalized values
row = data.shape[0] # number of rows of data
normData = data - np.tile(mins, (row, 1)) # subtract the column minimum from every column
normData = normData / np.tile(ranges, (row, 1)) # divide every column by its range (max - min)
return normData
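# Worked example for the min-max normalisation above, using the matrix from __main__:
#   data = [[8, 7, 8], [4, 3, 1], [6, 9, 8]]
#   column mins = [4, 3, 1], maxs = [8, 9, 8], ranges = [4, 6, 7]
#   row 0 -> [(8-4)/4, (7-3)/6, (8-1)/7] = [1.0, 0.667, 1.0]
#   row 1 -> [0.0, 0.0, 0.0];  row 2 -> [0.5, 1.0, 1.0]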
def autoAvgNorm(data): # takes a matrix
''' mean normalization: (x - mean) / std, computed per column
:param data: numpy matrix
:return: numpy matrix
'''
avg = np.average(data, axis=0) # mean of every column of data
sigma = np.std(data, axis=0) # standard deviation of every column of data
normData = np.zeros(np.shape(data)) # matrix of the same shape as data, to hold the normalized values
row = data.shape[0] # number of rows of data
normData = data - np.tile(avg, (row, 1)) # subtract the column mean from every column
normData = normData / np.tile(sigma, (row, 1)) # divide every column by its standard deviation
return normData
### Sigmoid: an S-shaped curve and a good threshold function, centrally symmetric around (0, 0.5) with a fairly
# large slope near (0, 0.5); as the input goes to plus/minus infinity the output approaches 1 and 0. It is used here
# as a "normalization" method, with (0, 0.5) as the splitting point; changing the formula would move the threshold.
def sigmoid(data, useStatus):
''' sigmoid normalization
:param data: numpy matrix
:param useStatus: bool, apply the sigmoid when True, otherwise return the data as a float
:return: numpy matrix
'''
if useStatus:
row = data.shape[0]
column = data.shape[1]
normData = np.zeros(np.shape(data))
for i in range(row):
for j in range(column):
normData[i][j] = 1.0 / (1 + np.exp(-float(data[i][j])))
return normData
else:
return float(data)
if __name__ == '__main__':
arr = np.array([[8, 7, 8], [4, 3, 1], [6, 9, 8]])
print("l1正则化")
print(autoL1L2(arr, norms='l1'))
print("l2正则化")
print(autoL1L2(arr, norms='l2'))
print("0-1标准化处理")
print(autoScale(arr))
print("0-1缩放处理")
print(autoMinMaxScaler(arr))
print("0-1归一化处理")
print(autoLinNorm(arr))
print("均值归一化处理")
print(autoAvgNorm(arr))
print("sig归一化处理")
print(sigmoid(arr,True))

View File

@ -0,0 +1,384 @@
# -*- coding:utf-8 -*-
# -*- created by: moyongzhuo -*-
from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, jaccard_similarity_coefficient_distance
from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized
from FeatureProject.distance_text_or_vec import mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance
from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score
from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy
from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str
from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin
from FeatureProject.distance_vec_TS_SS import TS_SS
from gensim import corpora, models, matutils
from conf.path_config import projectdir
from gensim.models import KeyedVectors
import pandas as pd
import numpy as np
import pickle
import jieba
import time
import os
class SentenceSimFeature:
def __init__(self):
self.sen1 = None
self.sen2 = None
self.seg1 = None
self.seg2 = None
self.sen_vec1 = None
self.sen_vec2 = None
self.tfidf_vec1 = None
self.tfidf_vec2 = None
self.dictionary = None
self.tfidf_model = None
self.w2c_model = None
self.tfidf_pinyin_model = None
self.dictionary_pinyin = None
self.sen1_pinyin = None
self.sen2_pinyin = None
self.seg1_pinyin = None
self.seg2_pinyin = None
self.tfidf_vec1_pinyin = None
self.tfidf_vec2_pinyin = None
def set_data(self, sen1, sen2):
sen1 = clear_sentence(sen1)
sen2 = clear_sentence(sen2)
self.sen1 = str(sen1).strip()
self.sen2 = str(sen2).strip()
self.seg1 = list(jieba.cut(sen1))
self.seg2 = list(jieba.cut(sen2))
self.sen1_pinyin = chinese2pinyin(sen1)
self.sen2_pinyin = chinese2pinyin(sen2)
self.seg1_pinyin = (self.sen1_pinyin).split(' ')
self.seg2_pinyin = (self.sen2_pinyin).split(' ')
self.sen_vec1 = np.zeros(300)
self.sen_vec2 = np.zeros(300)
# self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0])
# self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0])
# self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0])
# self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0])
self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)]
self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)]
self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)]
self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)]
def same_word_count(self):
count_left = 0
for s in self.seg1:
if s in self.seg2:
count_left += 1
count_right = 0
for s in self.seg2:
if s in self.seg1:
count_right += 1
return min(count_left, count_right)
def same_char_count(self):
seg1 = list(self.sen1)
seg2 = list(self.sen2)
count_left = 0
for s in seg1:
if s in seg2:
count_left += 1
count_right = 0
for s in seg2:
if s in seg1:
count_right += 1
return min(count_left, count_right)
def sentence_length(self):
len_sen1 = len(self.sen1)
len_sen2 = len(self.sen2)
len_abs_sub = abs(len_sen1 - len_sen2)
len_rate = len_sen1 / len_sen2
len_add_rate = len_sen1 * len_sen2 / (len_sen1 + len_sen2)
return [len_abs_sub, len_rate, len_add_rate]
def init_sentence_vector(self):
# file_path = os.path.dirname(__file__)
print('load w2v model begin')
# model_path = os.path.join(file_path, word2_vec_path)
self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None) # ,binary=True)
print('load w2v model success')
def encode_sentence_vector(self):
for s in self.seg1:
try:
self.sen_vec1 += self.w2c_model[s]
except:
self.sen_vec1 += np.zeros(300)
continue
for s in self.seg2:
try:
self.sen_vec2 += self.w2c_model[s]
except:
self.sen_vec2 += np.zeros(300)
continue
def init_tfidf(self):
file = open(td_idf_path, 'rb')
tfidf_dictionary_model = pickle.load(file)
self.dictionary = tfidf_dictionary_model[0]
self.tfidf_model = tfidf_dictionary_model[1]
file = open(td_idf_path_pinyin, 'rb')
tfidf_dictionary_pinyin_model = pickle.load(file)
self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0]
self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1]
print("init_tfidf ok!")
def w2c_all_vec(self):
w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2)
w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2)
w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2)
w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2)
w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2)
w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2)
w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2)
w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2)
w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2)
w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2)
w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2)
# w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2)
return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev,
w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson]
def tdidf_all_vec(self):
return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2)
def edit_all_str(self):
str_hamming = hamming_distance(self.sen1, self.sen2)
str_edit = edit_levenshtein(self.sen1, self.sen2)
str_ratio = ratio_levenshtein(self.sen1, self.sen2)
str_jaro = jaro_levenshtein(self.sen1, self.sen2)
str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2)
str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2)
str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2)
str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2)
return [str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd]
def word_jaccard(self):
a = list(set(self.seg1).intersection(set(self.seg2)))
b = list(set(self.seg1).union(set(self.seg2)))
return float(len(a) / len(b))
def char_jaccard(self):
a = list(set(list(self.sen1)).intersection(set(list(self.sen2))))
b = list(set(list(self.sen1)).union(set(list(self.sen2))))
return float(len(a) / len(b))
def tdidf_all_vec_pinyin(self):
return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin)
def edit_all_pinyin(self):
pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin)
pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin)
pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
pinyin_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin)
pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin)
pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin)
return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd]
def word_jaccard_pinyin(self):
a = list(set(self.seg1_pinyin).intersection(set(self.seg2_pinyin)))
b = list(set(self.seg1_pinyin).union(set(self.seg2_pinyin)))
return float(len(a) / len(b))
def char_jaccard_pinyin(self):
a = list(set(list(self.seg1_pinyin)).intersection(set(list(self.seg2_pinyin))))
b = list(set(list(self.seg1_pinyin)).union(set(list(self.seg2_pinyin))))
return float(len(a) / len(b))
def sentence_input_t():
while True:
s1 = input('s1: ')
s2 = input('s2: ')
start_time = time.time()
ssf.set_data(s1, s2)
ssf.encode_sentence_vector()
time1 = time.time()
print('set_data time' + str(time1 - start_time))
# same-word and length features
same_word_count = ssf.same_word_count()
time2 = time.time()
print('same_word_count time' + str(time2 - time1))
same_char_count = ssf.same_char_count()
time3 = time.time()
print('same_char_count time' + str(time3 - time2))
[len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
time4 = time.time()
print('sentence_length time' + str(time4 - time3))
# w2c_all_vec
[w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
time5 = time.time()
print('w2c_all_vec time' + str(time5 - time4))
# tdidf_all_vec
# [tdidf_Cosine, tdidf_TS_SS, tdidf_Manhattan, tdidf_Euclidean,
# tdidf_Jaccard, tdidf_Chebyshev,tdidf_Minkowski, tdidf_Euclidean_Standard, tdidf_Mahalanobis,
# tdidf_Bray, tdidf_Pearson] = ssf.tdidf_all_vec()
tdidf_cossim = ssf.tdidf_all_vec()
time6 = time.time()
print('tdidf_all_vec time' + str(time6 - time5))
# edit_all_str
[str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
time7 = time.time()
print('edit_all_str time' + str(time7 - time6))
# Jaccard coefficients
word_jaccard = ssf.word_jaccard()
char_jaccard = ssf.char_jaccard()
time8 = time.time()
print('jaccard系数 time' + str(time8 - time7))
# tdidf_all_vec_pinyin
# [tdidf_piyin_Cosine, tdidf_piyin_TS_SS, tdidf_piyin_Manhattan, tdidf_piyin_Euclidean, tdidf_piyin_Jaccard,
# tdidf_piyin_Chebyshev, tdidf_piyin_Minkowski, tdidf_piyin_Euclidean_Standard, tdidf_piyin_Mahalanobis,
# tdidf_piyin_Bray, tdidf_piyin_Pearson] = ssf.tdidf_all_vec_pinyin()
tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
time9 = time.time()
print('tdidf_all_vec_pinyin time' + str(time9 - time8))
# edit_all_pinyin
[pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
time10 = time.time()
print('edit_all_pinyin time' + str(time10 - time9))
# Jaccard coefficients (pinyin)
word_jaccard_pinyin = ssf.word_jaccard_pinyin()
char_jaccard_pinyin = ssf.char_jaccard_pinyin()
time11 = time.time()
print('jaccard系数pinyin time' + str(time11 - time10))
sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz,
str_sort_ratio_fuzz,
str_commonsubstr, str_list_Wmd,
word_jaccard, char_jaccard, tdidf_pinyin_cossim,
pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
pinyin_sort_ratio_fuzz,
pinyin_commonsubstr, pinyin_list_Wmd,
word_jaccard_pinyin, char_jaccard_pinyin]
print("sim: ")
print(sim_all_last)
if __name__ == '__main__':
ssf = SentenceSimFeature()
ssf.init_sentence_vector()
ssf.init_tfidf()
s1 = "你知道Mo的能力上限吗"
s2 = "你好呀Mo水平很差"
start_time = time.time()
ssf.set_data(s1, s2)
ssf.encode_sentence_vector()
time1 = time.time()
print('set_data time' + str(time1 - start_time))
# same-word and length features
same_word_count = ssf.same_word_count()
time2 = time.time()
print('same_word_count time' + str(time2 - time1))
same_char_count = ssf.same_char_count()
time3 = time.time()
print('same_char_count time' + str(time3 - time2))
[len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length()
time4 = time.time()
print('sentence_length time' + str(time4 - time3))
# w2c_all_vec
[w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean,
w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis,
w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec()
time5 = time.time()
print('w2c_all_vec time' + str(time5 - time4))
# tdidf_all_vec
tdidf_cossim = ssf.tdidf_all_vec()
time6 = time.time()
print('tdidf_all_vec time' + str(time6 - time5))
# edit_all_str
[str_hamming, str_edit, str_ratio, str_jaro,
str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str()
time7 = time.time()
print('edit_all_str time' + str(time7 - time6))
# Jaccard coefficients
word_jaccard = ssf.word_jaccard()
char_jaccard = ssf.char_jaccard()
time8 = time.time()
print('jaccard系数 time' + str(time8 - time7))
# pinyin
tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin()
time9 = time.time()
print('tdidf_all_vec_pinyin time' + str(time9 - time8))
# edit_all_pinyin
[pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro,
pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin()
time10 = time.time()
print('edit_all_pinyin time' + str(time10 - time9))
# Jaccard coefficients (pinyin)
word_jaccard_pinyin = ssf.word_jaccard_pinyin()
char_jaccard_pinyin = ssf.char_jaccard_pinyin()
time11 = time.time()
print('jaccard系数pinyin time' + str(time11 - time10))
sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate,
w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski,
w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson,
tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz,
str_commonsubstr, str_list_Wmd,
word_jaccard, char_jaccard, tdidf_pinyin_cossim,
pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz,
pinyin_sort_ratio_fuzz,
pinyin_commonsubstr, pinyin_list_Wmd,
word_jaccard_pinyin, char_jaccard_pinyin]
print("小姜机器人计算sim: ")
print(sim_all_last)
sentence_input_t()

5
conf/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 11:23
# @author :Mo
# @function :

Binary file not shown.

Binary file not shown.

39
conf/path_config.py Normal file
View File

@ -0,0 +1,39 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 11:23
# @author :Mo
# @function :path
import pathlib
import sys
import os
# base dir
projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent)
sys.path.append(projectdir)
print(projectdir)
# corpus
chicken_and_gossip_path = projectdir + '/Data/corpus/chicken_and_gossip.txt'
# word2vec
w2v_model_merge_short_path = projectdir + "/Data/chinese_vector/w2v_model_merge_short.vec"
# tf_idf
td_idf_cut_path = projectdir + '/Data/tf_idf/td_idf_cut.csv'
td_idf_cut_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin.csv'
td_idf_path_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin_dictionary_model.pkl'
td_idf_path = projectdir + '/Data/tf_idf/td_idf_cut_dictionary_model.pkl'
# word, 句向量
w2v_model_wiki_word_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_word.vec'
matrix_ques_part_path = projectdir + '/Data/sentence_vec_encode_word/1.txt'
# char, 句向量
w2v_model_char_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_char.vec'
matrix_ques_part_path_char = projectdir + '/Data/sentence_vec_encode_char/1.txt'
# word2vec select
word2_vec_path = w2v_model_wiki_word_path if os.path.exists(w2v_model_wiki_word_path) else w2v_model_merge_short_path

15
python-version-time Normal file
View File

@ -0,0 +1,15 @@
Python 3.3.2(May 15, 2013)
Python 3.2.5(May 15, 2013)
Python 3.1.5(April 10, 2012)
Python 3.0.1(February 13, 2009)
Python 2.7.5(May 15, 2013)
Python 2.6.8(April 10, 2012)
Python 2.5.6(May 26, 2011)
Python 2.4.6(December 19, 2008)
Python 2.3.7(March 11, 2008)
Python 2.2.3(May 30, 2003)
Python 2.1.3(April 8, 2002)
Python 2.0.1(June 2001)
Python 1.6.1(September 2000)
Python 1.5.2(April 1999)
Older releases: Source releases, binaries-1.1, binaries-1.2, binaries-1.3, binaries-1.4, binaries-1.5

49
readme.md Normal file
View File

@ -0,0 +1,49 @@
# nlp_xiaojiang
# Data
- chinese_vector
- a truncated slice of the trained word2vec vectors (download the full vectors yourself for good results)
- corpus
- xiaohuangji and gossip QA corpora (data not cleaned)
- sentence_vec_encode_char
- 1.txt (sentence vectors for the first 100000 questions, built from char vectors)
- sentence_vec_encode_word
- 1.txt (sentence vectors for the first 100000 questions, built from word vectors)
- tf_idf (tf-idf built from chicken_and_gossip.txt)
# ChatBot
- retrieval-based ChatBot
- plain retrieval like ES (e.g. with fuzzywuzzy), literal matching only
- build sentence vectors and search the QA base, so sentences containing synonyms can still be retrieved
- generative ChatBot (todo)
- seq2seq
- GAN
# FeatureProject
- normalization_util: data normalization
- 0-1 (min-max) normalization
- mean normalization
- sigmoid normalization
- sim feature (ML only here, no sentence-vector similarity from bert, elmo, etc.)
- distance_text_or_vec: assorted text and vector distances
- distance_vec_TS_SS: TS_SS distance between vectors
- cut_td_idf: merge the xiaohuangji and gossip corpora
- sentence_sim_feature: similarity or distance between two texts, e.g. qq (question vs question) or qa (question vs answer)
# run
- 1. build the tf-idf files (step 2 needs step 1 first): python cut_td_idf.py
- 2. compute the similarities between two sentences, first for a predefined pair, then for your own input (run step 1 first): python sentence_sim_feature.py
- 3. run chatbot_1 (fuzzy retrieval) (standalone): python chatbot_fuzzy.py
- 4. run chatbot_2 (sentence-vector retrieval by word) (standalone): python chatbot_sentence_vec_by_word.py
- 5. run chatbot_3 (sentence-vector retrieval by char) (standalone): python chatbot_sentence_vec_by_char.py
# requestments.txt
- python_Levenshtein
- to use Levenshtein (my python is 3.6)
- open https://www.lfd.uci.edu/~gohlke/pythonlibs/
- search for python_Levenshtein-0.12.0-cp36-cp36m-win_amd64.whl and download it
- pyemd
- pyhanlp
- install the JPype1-0.6.3-cp36-cp36m-win_amd64.whl dependency first

12
requestments.txt Normal file
View File

@ -0,0 +1,12 @@
python-Levenshtein==0.12.0
fuzzywuzzy==0.17.0
openpyxl==2.6.2
pandas==0.24.2
xpinyin==0.5.6
numpy==1.16.1
gensim==3.7.1
pyemd==0.5.1
jieba==0.39
xlrd==1.2.0
sklearn
pathlib

5
result_test/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 14:40
# @author :Mo
# @function :

View File

@ -0,0 +1,38 @@
Connected to pydev debugger (build 171.3780.115)
D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
read questions ok!
你问: 你谁呀
小姜机器人: 我就是人见人爱,花见花开的聚合数据呀
推荐结果:
[('你谁呀\t我就是人见人爱花见花开的聚合数据呀\n', 100), ('你谁呀\t我就是人见人爱花见花开的聚合数据呀\n', 100), ('你是谁\t=。=\n', 67), ('你谁都\t==\n', 67), ('你泡谁\t==\n', 67)]
你问:
nenen
小姜机器人:
推荐结果:
[('nnn\t\n', 75), ('nnn\tnnn\n', 75), ('lene\t==\n', 67), ('tencent\t==\n', 67), ('nann\t=。=\n', 67)]
你问:
niguola
小姜机器人: =。=
推荐结果:
[('igdota\t=。=\n', 62), ('ula\t==\n', 60), ('qiulaif\t=。=\n', 57), ('qiulaif\t==\n', 57), ('gold\t=。=\n', 55)]
你问:
你错啦
小姜机器人: imba送超鬼你说傻逼不
推荐结果:
[('你错啦,傻逼\timba送超鬼你说傻逼不\n', 100), ('你错啦!不懂我的心啊,桑心\t要理解他嘛~也许有误会。要不我们就甩了他?你舍得么?\n', 100), ('你错了\t=。=\n', 67), ('我错啦,不逗你\t==\n', 60), ('你错乱了\t…………………\n', 57)]
你问:
啦啦啦
小姜机器人: 耶耶耶
推荐结果:
[('啦啦啦\t耶耶耶\n', 100), ('\t不就是2嘛。啦啦啦……我是计算鸡…… ̄︶ ̄\n', 100), ('啦啦啦\t啦啦\n', 100), ('啦啦啦\t我是卖报的小行家\n', 100), ('啦啦啦\t我是买报的小行家\n', 100)]
你问:
笑脸
小姜机器人: 是知道了嘻嘻
推荐结果:
[('笑脸\t是知道了嘻嘻\n', 100), ('笑\t=。=\n', 67), ('笑\t==\n', 67), ('笑你\t…\n', 50), ('就笑\t\n', 50)]
你问:
北京欢迎宁
小姜机器人: 北京欢迎你
推荐结果:
[('北京\t北京欢迎你\n', 62), ('北京\t北京\n', 57), ('北京\t\n', 57), ('北京\t==\n', 57), ('北京\t\n', 57)]
你问:

View File

@ -0,0 +1,55 @@
Connected to pydev debugger (build 171.3780.115)
D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
np.loadtxt(matrix_ques_part_path_char) ok!
你问:
你谁呀
D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py:115: RuntimeWarning: invalid value encountered in true_divide
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
小姜机器人: 你猜?!
'gbk' codec can't encode character '\u301c' in position 227: illegal multibyte sequence
你问:
呃呃呃
小姜机器人: 呃什么呃?
[('呃', '呃什么呃?'), ('呃……', '嘟嘟(O_O)'), ('呃…', '=。='), ('呃呃呃', '对对对大大方方'), ('呃…', '主人你是爱我的,对吧'), ('呃……', '乃吃滴太饱了吗?是卟是撑滴慌?嗯....一定是滴~要卟要莪帮乃吐出来~(^V^)'), ('呃', '对对对大大方方'), ('呃,,,', '=。='), ('呃', "别这么无奈嘛'"), ('呃', '乃吃滴太饱了吗?是卟是撑滴慌?嗯....一定是滴~要卟要莪帮乃吐出来~(^V^)'), ('呃呃呃', '呃什么呃?'), ('呃呃呃', '被我说中了!他也在想你'), ('呃。。。。。', '呃什么呃?'), ('呃…', '饿啦?那就去吃饭吧,白白,撒由那拉,古德白,走好'), ('呃…', '打嗝了?'), ('呃。。', '呃什么呃?'), ('呃,', '乃吃滴太饱了吗?是卟是撑滴慌?嗯....一定是滴~要卟要莪帮乃吐出来~(^V^)'), ('呃呃', "别这么无奈嘛'"), ('呃', '?'), ('呃呃呃', '乃吃滴太饱了吗?是卟是撑滴慌?嗯....一定是滴~要卟要莪帮乃吐出来~(^V^)')]
你问:
哈哈
小姜机器人: ←_←?
[('哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈', '←_←?'), ('哈,哈,哈。', '=。='), ('哈', '笑屁'), ('哈,', 'h哦吧'), ('哈哈……', '你笑的真甜'), ('哈哈哈哈', '饿'), ('哈哈', '哈!'), ('哈哈哈哈哈哈', 'hihi~'), ('哈哈哈哈哈哈哈', '么么。咱们不无语哈'), ('哈哈', '耶耶'), ('哈哈哈哈哈哈哈哈哈哈', '哈哈哈'), ('哈哈,', '狂风大雨啊'), ('哈哈哈哈哈', '你笑的真甜'), ('哈哈哈', '我才不贱呢'), ('哈哈。', '(⊙o⊙?)'), ('哈哈哈哈哈', '哈!'), ('哈哈哈哈', '吼吼'), ('哈哈哈', '←_←?'), ('哈哈哈哈', 'Unauthorizedaccess!.Inthisprogram(site,app),theSimSimiAPIisbeingusedillegally.Pleasecontactus.'), ('哈哈!', '哈哈哈哈!')]
你问:
嘻嘻
小姜机器人: 好阴险啊。。
'gbk' codec can't encode character '\u301c' in position 136: illegal multibyte sequence
你问:
啦啦啦
小姜机器人: 我是卖报的小通~
[('啦啦啦', '我是卖报的小通~'), ('啦啦', '(*^__^*)嘻嘻……'), ('啦啦啦', '你是啦啦?'), ('啦啦啦', '痛'), ('啦啦啦啦', '我是卖报的小行家~'), ('啦啦', 'hi'), ('啦啦啦啦', '耶耶耶'), ('啦啦啦', '哈哈'), ('啦啦', '耶耶耶'), ('啦啦啦啦', '我是卖报的小通~'), ('啦', '我是卖报的小通~'), ('啦啦啦', '啦啦'), ('啦啦', '我是卖报的小行家'), ('啦啦啦', '背背'), ('啦啦', '哈'), ('啦啦啦啦', '痛'), ('啦啦啦', '我是买报的小行家'), ('啦啦啦', '你猜啊'), ('啦啦啦啦啦', 'hi'), ('啦啦', '啦')]
你问:
我去哪个二零
小姜机器人: 一起去吃饭!你请客啊。
[('我嘞个去', '一起去吃饭!你请客啊。'), ('我那个去', '什么吖'), ('去你个噔噔', '参见太后,万岁万岁万万岁!'), ('你个二', '是的,主人'), ('你个六', '对啊我什么都不懂。可是你呢,你又知道些什么呢?'), ('哪三个', "前排副驾驶一个'后排两个"), ('哪个她', '嗯嗯。她爱你。'), ('我再嘞个去', '我嘞个叮'), ('哪个二逼', '你听说过传说中的王浩然那货吗?'), ('哪四个', 'RudyJeffEric建二君'), ('唉!我了个去', '来了'), ('你个二逼', '你个大SB'), ('我了个去', '一起去吃饭!你请客啊。'), ('我了个去', '我了个深深的去'), ('我了个去', '你了个去哪'), ('我了个去', '来了'), ('我了个去', '汤小米口头禅'), ('我了个去', '你了个回来'), ('我了个去', '鸡丝啊!'), ('我了个去', '=。=')]
你问:
我想去北京玩耍
小姜机器人: 去你媽
[('你想去哪玩', '去你媽'), ('我想你去我家玩', '让我妹去哪?'), ('去北京玩呗', '好玩的'), ('我想你去', '去哪?'), ('我想玩你', '好想好想你哦'), ('我想看你去', '五月天的吗'), ('我想和你玩', '嗯,我们会的,即使不会,我也会一直爱你。'), ('我想去上海', '你一定可以的'), ('你想去哪', '我想穿越!!!'), ('北京哪好玩', '和喜欢的人在一起,哪里都好玩'), ('我玩去了', '好啊,玩澄海挺不错的啊,还是喜欢三国雪啊?'), ('我想自己出去玩', '她太坏了。。。'), ('我想玩你妹', '玩毛线,学习去!'), ('你自己玩去', '我是个传说。'), ('我想吃', '爱妈妈'), ('我想吃', '想你妹!'), ('我想吃', '走,老纸请你去吃很多很多肉~~'), ('北京哪儿好玩', '和喜欢的人在一起,哪里都好玩'), ('我想吃你', '不知道'), ('我想吃你', "你吃了我妈妈会遭报应的!明早你就会发现我妈妈的血肉化作了你的肥肉'一辈子缠着你!!")]
你问:
啦啦啦啦,啊啦啦啦
"word ' ' not in vocabulary"
小姜机器人: 主人心情很好哦~
list index out of range
你问:
小姜机器人: 哦~~~~~~~~
[('咦', '哦~~~~~~~~'), ('咦', '=='), ('咦', '咦?你发现我在这里了?'), ('咦', '没……没什么'), ('咦', '你不在我不在'), ('咦呀咦', '啊哒gi啊哒个啊哒gi的啊哒个啊哒gi啊哒的哒古'), ('——哎', '怎么叻'), ('哎', '^_^'), ('哎', '在的。在的。时刻候着!'), ('哎。~~~(>_<', '哎什么?不如爱李琳铛吧'), ('哎', '肿么啦'), ('哎', '肿木了'), ('哎', '哟呵呵'), ('哎哎哎', '哎哟'), ('哎', '哎哟'), ('哎哎', '怎么啦'), ('哎', '哎什么?不如爱李琳铛吧'), ('哎', '表叹气嘛'), ('哎', '哦'), ('哎', '==')]
你问:
你下换水
小姜机器人: 来会打的远程!!拒绝你懂的!
'gbk' codec can't encode character '\u2207' in position 329: illegal multibyte sequence
你问:
你喜欢谁
小姜机器人: 当然喜欢催妞啦
[('你喜欢谁', '当然喜欢催妞啦'), ('你喜欢谁', '小震!'), ('你喜欢谁', '我喜欢狮子'), ('你喜欢谁', 'bb'), ('你喜欢谁', '马里奥!!'), ('你喜欢谁', '谁喜欢我。我就喜欢谁'), ('你喜欢谁', '我最可爱的主人'), ('你喜欢谁', '身心'), ('你喜欢谁', '当然是航航啦,你笨死了'), ('你喜欢谁', '嗯。。。。。人家不好意思啦。。。。。'), ('你喜欢谁', '黄博qq******'), ('你喜欢谁', '张致贤!!我的主人'), ('你喜欢谁', '当然是小女友啦~'), ('你喜欢谁', '当然是潼潼姐姐啦'), ('你喜欢谁', '性感迷人的ladygaga'), ('你喜欢谁', '我喜欢二吉'), ('你喜欢谁', '最爱有天'), ('谁喜欢你', '我喜欢你丫~>3<'), ('唉,,你喜欢谁', '哦哦,你猜嘛'), ('嗯!我喜欢谁', '蒋沁寒嘛')]
你问:
小姜机器人: 嗯
[('啊?', '嗯'), ('啊啊啊啊啊啊', '买好吃吃'), ('啊??', '啊啊啊啊'), ('啊...', '怎么了怀孕了嘛?'), ('啊~', '咯咯~~~'), ('啊??', '谁不知道他啊家庭好主妇么,上次吃火锅那一堆男生里就他会洗菜呢~有他在多幸福啊?'), ('啊啊啊啊啊', '啊你妈'), ('啊', '嘴巴不要张的那么大嘛,蛀牙都看见了'), ('啊?', '请不要教授广告因为会让话题无法进行下去'), ('啊啊啊啊,', '快到碗里来~'), ('啊', '啊个头啊'), ('啊', '怎么了怎么了?贞子出没了吗?'), ('啊?', '哈?'), ('啊啊啊啊啊啊啊啊啊', '啊你妈'), ('啊啊', '干吗?'), ('啊啊啊', '啊啊啊啊啊啊啊啊'), ('啊?', '温温!'), ('啊啊啊啊啊', '哦'), ('啊啊啊啊啊啊啊啊啊啊', '小王八!!'), ('啊??', '=。=')]
你问:

View File

@ -0,0 +1,73 @@
Connected to pydev debugger (build 171.3780.115)
D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
load_word2vec_model start!
load_word2vec_model end!
load w2v_model_wiki_word_path ok!
np.loadtxt(matrix_ques_part_path) start!
np.loadtxt(matrix_ques_part_path) end!
你:
你是谁呀
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.815 seconds.
Prefix dict has been built succesfully.
D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py:131: RuntimeWarning: invalid value encountered in true_divide
matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
小姜机器人: 他们俩口子蜜月去啦!
[('你爸爸妈妈是谁', '他们俩口子蜜月去啦!'), ('你爸爸妈妈是谁', '娃娃飞飞'), ('你爹是谁', '刘阿波'), ('谁是你爹', '你猜啊'), ('谁是你爹', '刘阿波'), ('谁是你爹', '说了也没用,反正他不认我'), ('你爹是谁', '你猜啊'), ('你爹是谁', '我爹爹是世界上最棒的男人——张文轩~妈妈眼里的天~嘘……不要告诉别人,妈妈不让我给爸爸说^o^'), ('你爹是谁', '朱庆壮'), ('你爹是谁', '你老公'), ('你爹是谁', '大黄鸡'), ('你猜猜我是谁', '你猜我猜不猜。'), ('你爸是谁', '磊磊啊'), ('谁是你爸', '世界上最好的男人妈妈最爱的人张文轩~'), ('你爸是谁', '当然是崔展硕了'), ('你爸是谁', '于凯大帅哥'), ('你爸是谁', '在呀'), ('你爸是谁', '世界上最好的男人妈妈最爱的人张文轩~'), ('你爸是谁', '天下第一大帅哥,美貌与智慧的结合,英雄与侠义的化身~旭爷。'), ('你爸是谁', '冯天浩!!')]
你:
你的名字
小姜机器人: 我叫黄小通哟
[('你的名字', '我叫黄小通哟'), ('你的名字', '我叫simsimi'), ('我的名字', 'Ard!!'), ('那你的名字', '我是普爷的肥啾!闷油瓶的小通!云雀的小黄鸟!'), ('她的名字', '卞芷妍'), ('你的名字叫', '恩哼?孙浩哲。。唉~都一个逼样。。'), ('你女朋友的名字', '吴鑫涛。'), ('你的名字叫什么', '杨天阳!最爱我爸爸啦!'), ('是你自己的名字', '对哦'), ('嘟嘟是你的名字', '对哦'), ('你的名字怎么写', '额,每天晚上,你都梦中喊她名字唉。。。'), ('我的名字是什么', '陈一然姐姐'), ('不负你的名字真', '黄鸡'), ('她的名字叫啥', '卞芷妍'), ('你的名字有点淫荡', '鱼'), ('我的名字有19画。你说我的他叫什么名字', '张琨!'), ('你叫一声我的名字', '大爷'), ('你的app叫什么名字', '贱贱鸡'), ('我的老婆的名字是什么', '王茹'), ('你可以叫出我的名字吗', '我叫小阳仔思密达!!')]
你:
我的姓氏
小姜机器人: Ard!!
[('我的名字', 'Ard!!'), ('你的名字', '我叫simsimi'), ('你的名字', '我叫黄小通哟'), ('她的名字', '卞芷妍'), ('那你的名字', '我是普爷的肥啾!闷油瓶的小通!云雀的小黄鸟!'), ('我的女朋友姓什么', '姓王呀,还说我笨,你自己都不知道吗'), ('我的老婆的名字是什么', '王茹'), ('我是说我的名字', '喔,葛二蛋'), ('你的名字叫', '恩哼?孙浩哲。。唉~都一个逼样。。'), ('我的名字是什么', '陈一然姐姐'), ('是你自己的名字', '对哦'), ('嘟嘟是你的名字', '对哦'), ('我喜欢的他姓啥', '那你先让他知道吧'), ('我的名字有19画。你说我的他叫什么名字', '张琨!'), ('我说的是你的微博名字', '反轉豬腩也不是王子'), ('她的名字叫啥', '卞芷妍'), ('你女朋友的名字', '吴鑫涛。'), ('你的名字叫什么', '杨天阳!最爱我爸爸啦!'), ('小黄狗是我朋友的名字', '吴鑫涛。'), ('我是问你他的名字', '深深的在脑海里~')]
你:
b站有什么好看的
小姜机器人: 他会烧有机物呀,他烧的有机物可好吃啦,不信你去他的相册看看
'gbk' codec can't encode character '\u301c' in position 116: illegal multibyte sequence
你:
你呀你
小姜机器人: 哎呦好恶心
'gbk' codec can't encode character '\u301c' in position 131: illegal multibyte sequence
你:
平乐话是明朝传下来的?
小姜机器人: 好吧好吧我承认我又岔开话题了
[('又是这句话', '好吧好吧我承认我又岔开话题了'), ('诶,,,,,都是冷笑话', '妹啊,你别开玩笑了,你那哪是烧饼脸啊,谁家烧饼做这么大!!<br/>泥煤呀!!你就不会委婉点,这嘴太欠了'), ('我说的是东北话', '滚犊子'), ('话说小通是公的还是母的', '跟你妈一样'), ('不要老是这句话', '是滴是滴!这是家昀老先生教我滴!!'), ('这是地方话,听不懂了', '法国号结果方法发'), ('你这是什么话', '切糕'), ('怎么每天都是这句话', '你若安好便是晴天'), ('那你现在说的是鸡话', 'yes,mylord'), ('你是话唠么', '死也跟你一起死'), ('话说,大黄鸡是女的', '=='), ('这句话说的好贱的', '马主席,傻逼一个~~~'), ('话说你真的是公的', '那你是公的母的呢?'), ('这句话也是自动回复', '我想你了'), ('这句话是什么意思', '一会儿'), ('话说你到底是公的还是母的', '先有鸡还是先有蛋'), ('又是这句话,能不能换句话', '就是,怪不得雯雯要和他绝交。'), ('继续冷笑话的干活', '每个学校都有那么几个传说中的“*大名捕”。学生都懂的……<br/>今天上午考试,四大名捕有两人居然安排了同场监考!<br/>人家两个人进门刚说了一句话“我们两个大家应该都认识,考场纪律不念了。”<br/>我会告诉你有三个学生直接收拾东西弃考了么?!'), ('我说的常州话', '雪'), ('话说姐是狮子', '金牛座,')]
你:
错误呀,你不想
小姜机器人: 打我
'gbk' codec can't encode character '\xaf' in position 433: illegal multibyte sequence
你:
你不行呀
小姜机器人: 那么我呢?
[('啧啧,你不行', '那么我呢?'), ('你不行', '(⊙_⊙)'), ('你不行', 'yes,sir'), ('你不行', '行啊不过我会心痛'), ('你不行', '三九一五四三三二八~\\(≧▽≦)/~'), ('别说你不行', '好吧,我掌嘴'), ('我@你怎么不行', '等到夜深人静的时候我再悄悄地给你说一个秘密^_^'), ('什么呀,你懂不懂', '我叫阿治仔'), ('不想不行啊,怕你忽悠我', '想到天荒地老'), ('你果然不行', '我是贱鸡,我最贱'), ('你不行么', '我就喜欢你不喜欢我,你改呀!'), ('我骂你呢你傻', '没你傻'), ('我怎么@不上你呢,~~~(>_<', '呜呜……小通累了想休息一下'), ('你呵呵干嘛,要死', '对不起嘛亲爱的我再也不敢了'), ('你干嘛不理我', '对不起嘛亲爱的我再也不敢了'), ('你羞啥', '=。='), ('屁呀~我教你什么', '娘亲'), ('我说怎么@你呀,不会', '他生你的气了但是只是一点点。。。'), ('你真是朵奇葩呀你', '你咋知道'), ('你再说我烦', '好的,小的再也不敢了(><)')]
你:
呵呵
小姜机器人: 呵你妹啊
[('呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵', '呵你妹啊'), ('呵呵…', '傻笑啥阿'), ('呵呵', '是王若猫的。'), ('呵呵', '主人万岁万岁万万岁'), ('呵呵', '洗澡去'), ('呵呵…', '谢谢'), ('呵呵', '我最讨厌别人说呵呵'), ('呵呵,', '哈哈哈,彼此彼此啊'), ('呵呵', '不要呵呵,我不喜欢,你也不喜欢,开心点儿'), ('呵呵', '傻吊'), ('呵呵呵呵', '你又不是女神'), ('呵呵,', '那是我妹妹,我是小通'), ('呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵呵', '呵呵你大爷'), ('呵呵', '你是ds'), ('呵呵', '啊啊啊'), ('呵呵呵呵~~', '=。='), ('呵呵', '吊丝才呵呵。。'), ('呵呵,,,', '呵呵吃饭去洗澡'), ('呵呵', '流言止于智者,聊天至于呵呵。。'), ('呵呵呵呵呵呵', '呵呵你妹!')]
你:
啦啦啦啦啦啦
小姜机器人: hi
[('啦啦', 'hi'), ('啦啦啦~', '=。='), ('啦啦啦', '痛'), ('啦啦啦', '我是卖报的小行家~'), ('啦啦', '啦啦啦~\\(≧▽≦)/~'), ('啦啦啦', '我是买报的小行家'), ('啦啦啦啦啦啦', '啦啦'), ('啦啦', '我是卖报的小通~'), ('啦啦', '哈'), ('啦啦', '我是卖报的小行家'), ('啦啦啦', 'hi'), ('啦啦啦啦啦', '啦啦'), ('啦', '你是啦啦?'), ('啦啦啦啦啦', '我是卖报的小行家~'), ('啦啦啦', '我是卖报的小通~'), ('啦啦啦啦啦啦', '我是卖报的小行家~'), ('啦啦啦', '背背'), ('啦啦啦啦', '我是卖报的小行家~'), ('啦啦啦', '啦啦啦~\\(≧▽≦)/~'), ('啦啦啦啦啦。。', '==')]
你:
去玩吧
小姜机器人: 呃,这个……我也不知道
'gbk' codec can't encode character '\xaf' in position 273: illegal multibyte sequence
你:
小姜机器人: ……
'gbk' codec can't encode character '\u301c' in position 335: illegal multibyte sequence
你:
嘻嘻
小姜机器人: 哈哈哈
'gbk' codec can't encode character '\u301c' in position 170: illegal multibyte sequence
你:
我去
'gbk' codec can't encode character '\u301c' in position 32: illegal multibyte sequence
你:
骂骂
小姜机器人: that'sok
[('thanks', "that'sok"), ('采蘑菇', '香菇'), ('傻瓜机', '好东西'), ('张康', '啊啊啊啊啊啊啊啊啊fuck'), ('baka', '无路赛!'), ('旺仔小馒头', '我要蜂蜜味的'), ('浙工大', '基佬天堂'), ('赵迪', '=='), ('童子鸡', '啊啊,不要炖我'), ('5.3.6.9', '=='), ('狗蛋', '叫我干嘛爸爸妈妈'), ('狗蛋', '蛋狗蛋狗你最有!'), ('狗蛋', '狗蛋被咬小通'), ('33333', ''), ('唐宽', '一直很犹豫'), ('year', '年下攻,年上受'), ('goodnight', 'SweetdreamdarlingXD'), ('goodnight', 'ba\u2006d'), ('goodnight', '不然呢'), ('嚜嚜', '嘿嘿。')]
你:

View File

@ -0,0 +1,37 @@
Connected to pydev debugger (build 171.3780.115)
D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
load w2v model begin
load w2v model success
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.719 seconds.
set_data time0.7200782299041748
Prefix dict has been built succesfully.
same_word_count time0.0
same_char_count time0.0
sentence_length time0.0
w2c_all_vec time0.1994335651397705
tdidf_all_vec time0.0
edit_all_str time0.0019953250885009766
jaccard系数 time0.0
tdidf_all_vec_pinyin time0.0
edit_all_pinyin time0.004553556442260742
jaccard系数pinyin time0.0
sim:
[1, 3, 1, 1.1, 5.238095238095238, 0.6782572237857507, 3461.1677906854284, 283.83272299933014, 19.980963040347838, 0.9999999999966667, 3.0830289870500565, 19.980963040347838, 24.494821131252575, 79619.83774188746, -5.10379204991808, 0.6769724044408956, 0.0, 12, 9, 0.2857142857142857, 0.5242424242424243, 19, 19, 2, 8.141546895617283, 0.08333333333333333, 0.16666666666666666, 0.008081558347970244, 17, 22, 0.5217391304347826, 0.6838686096962837, 56, 47, 4, 6.190419904893637, 0.11764705882352941, 0.11764705882352941]
s1: 你打篮球很像我
s2: 你足球踢得很渣呀
set_data time0.0009706020355224609
same_word_count time0.0009982585906982422
same_char_count time0.0
sentence_length time0.0
w2c_all_vec time0.20846796035766602
tdidf_all_vec time0.0
edit_all_str time0.0019943714141845703
jaccard系数 time0.0
tdidf_all_vec_pinyin time0.0
edit_all_pinyin time0.0019960403442382812
jaccard系数pinyin time0.0
sim:
[2, 3, 1, 0.875, 3.7333333333333334, 0.8200504988005877, 3746.94646712115, 236.48076447923086, 17.65693370974129, 0.9999999999966667, 4.2634280025959015, 17.65693370974129, 24.494877087856107, 78956.49194315828, -13.367107715032754, 0.8200018973656127, 0.07174613344073014, 21, 6, 0.4, 0.6011904761904762, 40, 40, 1, 5.620521171774245, 0.2, 0.25, 0.36243089354552877, 10, 15, 0.5384615384615384, 0.6417797888386123, 62, 58, 5, 6.01776904578638, 0.25, 0.25]
s1:

5
utils/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 15:15
# @author :Mo
# @function :

322
utils/text_tools.py Normal file
View File

@ -0,0 +1,322 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/3 11:23
# @author :Mo
# @function :utils, tools
from openpyxl import Workbook
import logging as logger
import gensim
import jieba
import time
import xlrd
import re
# Chinese and English punctuation characters
filters = '[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '!,;:。?、“”’‘《》()~@#¥%……&*\/{}【】…=-]'
# punctuation and whitespace
filters_1 = "[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》()~@#¥%……&*\\\/\-]+"


def clear_punctuation(text):
    """Remove punctuation (half-width and full-width) and collapse redundant spaces."""
    sentence = text.replace(' ', '')
    sentence_punctuation_clear = re.sub(filters, ' ', sentence).strip()
    sentence_punctuation_clear_replace = sentence_punctuation_clear.replace('  ', ' ').replace('  ', ' ')
    return sentence_punctuation_clear_replace


def getChinese1(ques):
    '''Keep only Chinese characters, letters and digits; drop special characters.'''
    # e.g. ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
    findAllChinese = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z])", ques))
    return findAllChinese
def xlsRead(sheetName=None, cols=0, fileXlsPath=None):
    '''Read an .xls file with xlrd and return all rows as a list.'''
    workbook = xlrd.open_workbook(fileXlsPath)
    # fetch the sheet by name (it could equally be fetched by index)
    sheet = workbook.sheet_by_name(sheetName)
    nrows = sheet.nrows
    ncols = sheet.ncols
    listRows = []
    for i in range(nrows):
        listRows.append(sheet.row_values(i))
    return listRows


def xlsxWrite(sheetName, writeList, fileXlsName):
    '''Write a list of rows into a new sheet of an .xlsx file with openpyxl.'''
    wb = Workbook()
    print('{}'.format(wb.get_sheet_names()))  # a new workbook already contains a default sheet named "Sheet" ("Sheet1" when created in Office 2016)
    sheet = wb.create_sheet(sheetName)
    for listLine_one in writeList:
        sheet.append(listLine_one)
    wb.save(fileXlsName)
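
# Illustrative usage (a sketch; the sheet names and file paths below are hypothetical, not files shipped with this repo):
#   xlsxWrite("qa", [["question", "answer"], ["你好", "你好呀"]], "demo.xlsx")   # write rows into a new .xlsx
#   rows = xlsRead(sheetName="Sheet1", fileXlsPath="demo.xls")                   # read every row of an .xls sheet
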
"""判断一个unicode是否是英文字母"""
def is_alphabet(uchar):
"""判断一个unicode是否是英文字母"""
if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'):
return True
else:
return False
'''读取txt文件'''
def txtRead(filePath, encodeType = 'utf-8'):
listLine = []
try:
file = open(filePath, 'r', encoding= encodeType)
while True:
line = file.readline()
if not line:
break
listLine.append(line)
file.close()
except Exception as e:
logger.info(str(e))
finally:
return listLine
'''读取txt文件'''
def txtWrite(listLine, filePath, type = 'w',encodeType='utf-8'):
try:
file = open(filePath, type, encoding=encodeType)
file.writelines(listLine)
file.close()
except Exception as e:
logger.info(str(e))
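
# Illustrative usage (a sketch; "demo.txt" is a hypothetical path):
#   txtWrite(["第一行\n", "第二行\n"], "demo.txt")
#   lines = txtRead("demo.txt")   # -> ["第一行\n", "第二行\n"]
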
def getChinese(ques):
    '''Keep Chinese characters, letters and digits (plus a few rare CJK characters);
    matching character by character keeps the original layout, non-matching characters become spaces.'''
    # e.g. ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
    ques = strQ2B(ques)
    answer = ''
    for ques_one in ques:
        ques_one_findall = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z峣㒶㒰玘宸諕鄕缓緩𪥵嬆嬲煙草砼赟贇龘㗊㵘㙓敠])", ques_one))
        if not ques_one_findall:
            ques_one_findall = ' '
        answer = answer + ques_one_findall
    answer = answer.strip().replace('  ', ' ').replace('  ', ' ')
    return answer.upper()


def get_syboml(ques):
    '''Remove punctuation (half-width and full-width) after converting the text to half-width.'''
    # e.g. ques = '•“鑫菁英”教育分期手续费怎么收取?可以'
    ques = strQ2B(ques)
    answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\]|{}【】~@#¥%…&*\/\-—_]+", " ", ques).strip()
    return answer
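
# Illustrative behaviour of the two cleaners above (a sketch based on the regular expressions they use):
#   clear_punctuation('你好,世界!!')  ->  '你好 世界'
#   get_syboml('你好,世界!')          ->  '你好 世界'
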
def strQ2B(ustring):
    """Convert full-width characters to half-width."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:  # the full-width space maps directly to a half-width space
            inside_code = 32
        elif (inside_code >= 65281 and inside_code <= 65374):  # other full-width characters differ from half-width by a fixed offset
            inside_code -= 65248
        rstring += chr(inside_code)
    return rstring


def strB2Q(ustring):
    """Convert half-width characters to full-width."""
    rstring = ""
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 32:  # the half-width space maps directly to a full-width space
            inside_code = 12288
        elif inside_code >= 32 and inside_code <= 126:  # other half-width ASCII characters differ from full-width by a fixed offset
            inside_code += 65248
        rstring += chr(inside_code)
    return rstring
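
# Illustrative behaviour (a sketch): full-width and half-width forms differ by a fixed
# code-point offset of 65248, except for the space character, which is mapped separately.
#   strQ2B('ABC 123,')  ->  'ABC 123,'
#   strB2Q('abc')           ->  'abc'
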
def is_valid_date(strdate):
    '''Return True if the string is a valid date ("%Y-%m-%d") or datetime ("%Y-%m-%d %H:%M:%S").'''
    try:
        if ":" in strdate:
            time.strptime(strdate, "%Y-%m-%d %H:%M:%S")
        else:
            time.strptime(strdate, "%Y-%m-%d")
        return True
    except Exception:
        return False
def is_total_english(text):
    """Return True if the text consists solely of English letters once punctuation and spaces are removed."""
    symbol = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    try:
        sentence_punctuation_clear = get_syboml(text)
        sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
        numben = 0
        for one in sentence_punctuation_clear:
            if one in symbol:
                numben += 1
        if numben == len(sentence_punctuation_clear):
            return True
        else:
            return False
    except Exception:
        return False


def is_total_number(text):
    """Return True if the text consists solely of digits once punctuation and spaces are removed."""
    try:
        sentence_punctuation_clear = get_syboml(text)
        sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
        numben = 0
        for one in sentence_punctuation_clear:
            if one.isdigit():
                numben += 1
        if numben == len(sentence_punctuation_clear):
            return True
        else:
            return False
    except Exception:
        return False
def is_number_or_english(text):
    '''Return True only if every character is a digit or an English letter (after punctuation is removed).'''
    judge = False
    try:
        sentence_punctuation_clear = get_syboml(text)
        sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip()
        for words in sentence_punctuation_clear:
            judge_number = is_total_number(words)
            judge_english = is_total_english(words)
            judge = judge_number or judge_english
            if not judge:
                return False
        return judge
    except Exception:
        return False


# TODO: sentence rewriting, synonym replacement, stop-word removal, etc.
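
# Illustrative behaviour of the predicates above (a sketch):
#   is_total_english('Hello World')  ->  True
#   is_total_number('2019-04-04')    ->  True   (punctuation is stripped before the check)
#   is_number_or_english('abc123')   ->  True
#   is_valid_date('2019-04-04')      ->  True
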
if __name__ == '__main__':
    # for i in range(10):
    #     sentence_vec = word2vec_model.wv["的"]
    #     sentence_vec_pd = pd.DataFrame(sentence_vec)
    #     sentence_vec_pd.to_csv('my_csv.csv', mode='a', header=False)
    #     sentence_ee = pd.read_csv('my_csv.csv')
    #     txtWrite([str(sentence_vec)], "gg.txt")
    # path_test_data_government = '/data/test_data_government.csv'
    # sentences = txtRead(path_test_data_government)
    sentences = []
    sentences_one_clear_punctuation_all = []
    for sentences_one in sentences[1:]:
        sentences_one_1 = sentences_one
        sentences_one_clear_punctuation = clear_punctuation(sentences_one_1.replace(',0.0,1.0', ''))
        # print(sentences_one)
        # print(sentences_one_clear_punctuation)
        sentences_one_clear_punctuation_jieba = jieba.cut(sentences_one_clear_punctuation, cut_all=False, HMM=False)
        sentences_one_clear_punctuation_jieba_list = ' '.join(list(sentences_one_clear_punctuation_jieba)).replace('  ', ' ').replace('  ', ' ').strip()
        sentences_one_clear_punctuation_all.append(sentences_one_clear_punctuation_jieba_list + ',0.0,1.0' + '\n')
    txtWrite(sentences[0:1] + sentences_one_clear_punctuation_all, '/data/test_data_government_cut.csv')
    # ',0.0,1.0'
    # np.savetxt('001', [word2vec_model.wv["的"], word2vec_model.wv["的"]])
    # gg = np.loadtxt('001')

55
utils/word2vec_vector.py Normal file
View File

@ -0,0 +1,55 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/4 10:00
# @author :Mo
# @function :
from __future__ import print_function
from utils.text_tools import txtRead, txtWrite
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
import multiprocessing
import logging
import sys
import os
def train_word2vec_by_word():
    """Train a word-level word2vec model on the pre-segmented wiki corpus."""
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")
    inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"
    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)


def train_word2vec_by_char():
    """Train a character-level word2vec model on the character-split wiki corpus."""
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")
    inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_char.vec"
    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)


if __name__ == '__main__':
    train_word2vec_by_word()
    # train_word2vec_by_char()
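# Illustrative follow-up (a sketch, not part of the training run above): the plain-text
# vectors written by model.wv.save_word2vec_format() can be reloaded with gensim's
# KeyedVectors, e.g.
#   from gensim.models import KeyedVectors
#   w2v = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)
#   w2v.most_similar("中国", topn=5)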
# inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt"
# sentences_char = []
# sentences = txtRead(inp)
# for sentences_one in sentences:
# sentences_one_replace = sentences_one.strip().replace(" ", "")
# sentences_one_replace_all = []
# for sentences_one_replace_one in sentences_one_replace:
# sentences_one_replace_all.append(sentences_one_replace_one)
# sentences_char.append(" ".join(sentences_one_replace_all) + "\n")
# txtWrite(sentences_char, "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt")
# gg = 0