diff --git a/ChatBot/__init__.py b/ChatBot/__init__.py new file mode 100644 index 0000000..d869156 --- /dev/null +++ b/ChatBot/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/29 23:11 +# @author :Mo +# @function : \ No newline at end of file diff --git a/ChatBot/chatbot_search/__init__.py b/ChatBot/chatbot_search/__init__.py new file mode 100644 index 0000000..d838479 --- /dev/null +++ b/ChatBot/chatbot_search/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 15:15 +# @author :Mo +# @function : \ No newline at end of file diff --git a/ChatBot/chatbot_search/chatbot_fuzzy.py b/ChatBot/chatbot_search/chatbot_fuzzy.py new file mode 100644 index 0000000..605690c --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_fuzzy.py @@ -0,0 +1,163 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + + +from conf.path_config import chicken_and_gossip_path +from utils.text_tools import txtRead, txtWrite +from conf.path_config import projectdir +from fuzzywuzzy import process +from fuzzywuzzy import fuzz +import pickle +import time +import re + + +def count_same_char(x1, x2): + '''获取相同字符的个数''' + res = [] + for x in x1: + if x in x2: + res.append(x) + if res: + return len(res) + else: + return 0 + + +def fuzzy_re(user_input, collection): + '''匹配方法, 效果不大好,只能匹配相同字数一样,或者字数比他多的那种,同义词或者是有一个词不一样,就没法区分开''' + suggestions = [] + user_input = user_input.replace('.', '').replace('*', '').replace('?', '') + + collection_new = [] + len_user_input = len(user_input) + for coll in collection: # 获取包含所有字符的,如果不包含,就返回错误 + count_coll = 0 + for i in range(len_user_input): + if user_input[i] in coll: + count_coll += 1 + if len_user_input == count_coll: + collection_new.append(coll) + if not collection_new: + return None + + + pattern = '.*?'.join(user_input) # Converts 'djm' to 'd.*?j.*?m' + try: + regex = re.compile(pattern) # Compiles a regex. + except: + gg = 0 + for item in collection_new: + match = regex.search(item) # Checks if the current item matches the regex. 
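        # (editor's note) If re.compile above failed, the bare except only sets gg = 0 and
        # leaves `regex` undefined, so this loop would raise NameError; returning an empty
        # list from that except branch would be the safer fallback. Matches collected below
        # are sorted by (matched span length, start position), so the tightest and earliest
        # matches in a candidate question rank first.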
+ if match: + suggestions.append((len(match.group()), match.start(), item)) + return [x for _, _, x in sorted(suggestions)] + + +def fuzzy_fuzzywuzzy(fuzz, user_input, collection): + '''编辑距离,速度比较慢,比起匹配方法,能够处理字符不一样的问题''' + collection_new = [] + len_user_input = len(user_input) + for coll in collection: # 获取包含一个字符的,如果不包含,就返回错误 + for i in range(len_user_input): + if user_input[i] in coll: + collection_new.append(coll) + if not collection_new: + return None + collection_new = list(set(collection_new)) + + same_char_list = [] + for collection_new_one in collection_new: # 获取相同字符串多的问题 + count_same_char_one = count_same_char(user_input, collection_new_one) + same_char_list.append((collection_new_one, count_same_char_one)) + same_char_list.sort(key=lambda x: x[1], reverse=True) + if len(same_char_list) >= 500: + same_char_list = same_char_list[0: 500] + + result = process.extract(user_input, same_char_list, scorer=fuzz.token_set_ratio, limit=20) + return result + + +def fuzzy_fuzzywuzzy_list(fuzz, user_input, qa_list, collection, topn=50): + '''编辑距离,速度比较慢,比起匹配方法,能够处理字符不一样的问题''' + + start_time = time.time() + # user_input_set = set([user_input_one for user_input_one in user_input]) + user_input_set = [user_input_one for user_input_one in user_input] + + + same_char_list = [] + max_data = 0 + max_data_list = [] + count_collection_new_one = 0 + for collection_new_one in collection: # 获取相同字符串多的问题 + count_same_char_one = len([x for x in user_input_set if x in collection_new_one]) + + if count_same_char_one > 0: + same_char_list.append((count_collection_new_one, count_same_char_one)) + if count_same_char_one > max_data: + max_data_list.append(count_same_char_one) + max_data = count_same_char_one + count_collection_new_one += 1 + + end_time1 = time.time() + list_max_count = [] + len_max_data_list = len(max_data_list) + for x in range(len_max_data_list): # 获取前20排名 + for k,l in same_char_list: + if l == max_data_list[len_max_data_list -1 - x]: + list_max_count.append(qa_list[k]) #问答重这里取出来 + if len(list_max_count) >= 5000: + list_max_count = list_max_count[0:5000] + break + + end_time2 = time.time() + + # end_time1: 0.34090662002563477 + # end_time2: 0.4080846309661865 + + # end_time1: 0.06417036056518555 + # end_time2: 0.08422374725341797 + + # same_char_list.sort(key=lambda x: x[1], reverse=True) + # if len(same_char_list) >= 20: + # same_char_list = same_char_list[0: 20] + + result = process.extract(user_input, list_max_count, scorer=fuzz.token_set_ratio, limit=topn) + end_time3 = time.time() + + # print('end_time1: ' + str(end_time1 - start_time)) + # print('end_time2: ' + str(end_time2 - start_time)) + # print('end_time3: ' + str(end_time3 - start_time)) + + return result + # [fuzz.WRatio, fuzz.QRatio, + # fuzz.token_set_ratio, fuzz.token_sort_ratio, + # fuzz.partial_token_set_ratio, fuzz.partial_token_sort_ratio, + # fuzz.UWRatio, fuzz.UQRatio] + + +if __name__ == '__main__': + start_time = time.time() + qa_list = txtRead(chicken_and_gossip_path) + questions = [qa.strip().split("\t")[0] for qa in qa_list] + print("read questions ok!") + sen = "你谁呀" + # list_fuzzyfinder = fuzzyfinder(base_syn_one_split[1], qa_list) + # list_fuzzyfinder = fuzzy_fuzzywuzzy(fuzz, base_syn_one_split[1], qa_list) + print("你问: " + "你谁呀") + list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, sen, qa_list, questions, topn=5) + print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip()) + print("推荐结果: ") + print(list_fuzzyfinder) + + while True: + print("你问: ") + ques = input() + list_fuzzyfinder = fuzzy_fuzzywuzzy_list(fuzz, 
ques, qa_list, questions, topn=5) + print("小姜机器人: " + list_fuzzyfinder[0][0].split("\t")[1].strip()) + print("推荐结果: ") + print(list_fuzzyfinder) diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py new file mode 100644 index 0000000..e229451 --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py @@ -0,0 +1,142 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function :chatbot based search, encode sentence_vec by char + +from conf.path_config import w2v_model_char_path +from conf.path_config import matrix_ques_part_path_char +from utils.text_tools import txtRead, txtWrite, getChinese +from conf.path_config import projectdir, chicken_and_gossip_path +from numpy import float32 as numpy_type +from collections import Counter +import pickle, jieba, os, re +import jieba.posseg as pseg +from gensim import matutils +from math import log +import numpy as np +import gensim +import jieba + + +def load_word2vec_model(path, bin=False, limit=None): + word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore') + return word2vec_model + + +def encoding_question(w2v_model, char_list): + ''' 生成句子向量 + :param wordlist: 分词list + :param is_replaced: 是否替换default true + :param debug_mode: default false + :return: array句子的向量 len=300 + ''' + try: + sentence_vec = w2v_model.wv[word2vec_model.index2word[1]] * 0 + except: + sentence_vec = w2v_model.wv[word2vec_model.index2word[0]] * 0 + + for k in range(len(char_list)): + char_list_one = char_list[k] + if type(char_list_one) == str: + try: + sentence_vec = sentence_vec + w2v_model.wv[char_list_one] + except Exception as e: + print(str(e)) + if char_list_one not in [' ', '']: + sentence_vec = sentence_vec + 1 + return sentence_vec + + +def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20): + """ + 最相似的句子,句向量与矩阵点乘 + :param vec: + :param matrix: + :param keys: + :param topn: + :return: + """ + # 首先对句向量矩阵标号 + matrix_org_index = list(range(len(matrix_org))) + # Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged. 
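    # (editor's note) Unit-normalising the query vector here and every row of matrix_org
    # below turns the plain dot product into cosine similarity, so the argsort further
    # down returns the stored questions closest in direction to the query sentence vector.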
+ vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) + # matrix_org单位化 + matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type) + # 计算两个向量之间的相似度,使用numpy的dot函数,矩阵点乘 + matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) + # 防止top_vec越界 + top_vec = min(len(matrix_org), top_vec) + # 相似度排序 + most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) + + index_score = [] + for t in most_similar_sentence_vec_sort[:top_vec]: + index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) + return index_score + + +def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path): + """ + 创建问题句向量 + :param sen_count: int + :param word2vec_model: gensim model + :param qa_path: str + :param matrix_ques_path:str + :return: None + """ + if os.path.exists(matrix_ques_path): + file_matrix_ques = open(matrix_ques_path, 'rb') + matrix_ques = pickle.load(file_matrix_ques) + return matrix_ques + print('create_matrix_org_pkl start!') + qa_dail = txtRead(qa_path, encodeType='utf-8') + # questions = [] + matrix_ques = [] + count = 0 + for qa_dail_one in qa_dail: + ques = getChinese(qa_dail_one.split('\t')[0]) + char_list = [ques_char for ques_char in ques] + sentence_vec = encoding_question(word2vec_model, char_list) + matrix_ques.append(sentence_vec) + if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: + print("count: " + str(count)) + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques) + matrix_ques = [] + break + + # count += 1 + # np.savetxt(projectdir + "/Data/sentence_vec_encode_char/" + str(count)+".txt", matrix_ques) + + print('create_matrix_org_pkl ok!') + # return matrix_ques + + +if __name__ == '__main__': + + # 读取问答语料 + syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8') + # 读取词向量 + word2vec_model = load_word2vec_model(w2v_model_char_path, limit=None) + # 创建标准问答中问题的句向量,存起来,到matrix_ques_path, 10万条,可自己设置,这里需要耗费点时间 + if not os.path.exists(matrix_ques_part_path_char): + # matrix_ques = create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) + create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path=matrix_ques_part_path_char) + # 重载 + matrix_ques = np.loadtxt(matrix_ques_part_path_char) + print("np.loadtxt(matrix_ques_part_path_char) ok!") + while True: + print("你问: ") + ques_ask = input() + ques_clean = getChinese(ques_ask) + char_list = [ques_char for ques_char in ques_clean] + sentence_vic = encoding_question(word2vec_model, char_list) + top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) + try: + print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) + print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) + except Exception as e: + # 有的字符可能打不出来 + print(str(e)) + diff --git a/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py new file mode 100644 index 0000000..0e3d61c --- /dev/null +++ b/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py @@ -0,0 +1,217 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function :chatbot based search, encode sentence_vec by word + + +from conf.path_config 
import w2v_model_merge_short_path, w2v_model_wiki_word_path +from conf.path_config import projectdir, chicken_and_gossip_path +from utils.text_tools import txtRead, txtWrite, getChinese +from conf.path_config import matrix_ques_part_path +from numpy import float32 as numpy_type +from collections import Counter +import pickle, jieba, os, re +import jieba.posseg as pseg +from gensim import matutils +from math import log +import numpy as np +import gensim +import jieba +import time + + +def load_word2vec_model(path, bin=False, limit=None): + print("load_word2vec_model start!") + word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path, limit=limit, binary=bin, unicode_errors='ignore') + print("load_word2vec_model end!") + return word2vec_model + + +def is_oov(model_vec, query_seg, p_max=0.16): + """ + 判断查询分词的oov情况是放弃,如果oov词个数超过xx%则放弃该回答答案 + :param topic_model: + :return: + """ + words = [word for word in query_seg if str(word).strip() is not ""] + count_total = 1 + count_oov = 0 + if words: + count_total = len(words) + for word in words: + if word not in model_vec: + count_oov = count_oov + 1 + return float(count_oov/count_total) > p_max + + +def get_td_idf_flag(jieba_cut_list, dictionary, tfidf_model): + # todo + '''获取td-idf权重,有问题,同一个词只计算一次,有的还没有,比如说停用词''' + seg1_list = [] + vec1 = tfidf_model[dictionary.doc2bow(jieba_cut_list)] + for vec1_one in vec1: + seg1_list.append(vec1_one[1]) + sum_seg1_list = sum(seg1_list) + + return [x/sum_seg1_list for x in seg1_list] + + +def get_jieba_flag(flag): + '''词性''' + if flag in ['n', 'nr', 'ns', 'nt', 'nz']: + weight = 1.3 + elif flag in ['r', 'i', 't', 'ng', 'an']: + weight = 0.7 + else: + weight = 1 + return weight + + +def word_segment_process(sentence): + """ + jieba切词\词性 + :param sentence: + :return: + """ + sentence = sentence.replace('\n', '').replace(',', '').replace('"', '').replace(' ', '').replace('\t', '').upper().strip() + word_list = [] + flag_list = [] + try: + sentence_cut = ''.join(jieba.lcut(sentence, cut_all=False, HMM=False)) + words = pseg.cut(sentence_cut) + for word in words: + word_list.append(word.word) + flag_list.append(word.flag) + except Exception as e: + word_list = [sentence] + flag_list = ['nt'] + return word_list, flag_list + + +def encoding_question(w2v_model, word_list, flag_list): + ''' 生成句子向量 + :param wordlist: 分词list + :param is_replaced: 是否替换default true + :param debug_mode: default false + :return: array句子的向量 len=300 + ''' + try: + sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0 + except: + sentence_vec = w2v_model.wv[w2v_model.index2word[1]] * 0 + + for k in range(len(word_list)): + word = word_list[k] + flag = flag_list[k] + if type(word) == str: + try: + sentence_vec = sentence_vec + w2v_model.wv[word] * get_jieba_flag(flag) + except Exception as e: + if word not in [' ', '']: + sentence_vec = sentence_vec + 1 + + return sentence_vec + + +def most_similar_sentence_vec(vec_ques, matrix_org, top_vec=20): + """ + 最相似的句子,句向量与矩阵点乘 + :param vec: + :param matrix: + :param keys: + :param topn: + :return: + """ + # 首先对句向量矩阵标号 + matrix_org_index = list(range(len(matrix_org))) + # Scale a vector to unit length. The only exception is the zerovector, which is returned back unchanged. 
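    # (editor's note) Questions whose sentence vector ends up all zeros (for example when
    # getChinese strips every character) make the row normalisation below divide by zero
    # and produce NaN; this is the "invalid value encountered in true_divide" RuntimeWarning
    # visible in the result_test logs. Adding a small epsilon to the denominator would avoid it.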
+ vec_ques_mean = matutils.unitvec(np.array([vec_ques]).mean(axis=0)).astype(numpy_type) + # matrix_org单位化 + matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type) + # 计算两个向量之间的相似度,使用numpy的dot函数,矩阵点乘 + matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean) + # 防止top_vec越界 + top_vec = min(len(matrix_org), top_vec) + # 相似度排序 + most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot, topn=top_vec, reverse=True) + + index_score = [] + for t in most_similar_sentence_vec_sort[:top_vec]: + index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])]) + return index_score + + +def create_matrix_org_np(sen_count, word2vec_model, qa_path, matrix_ques_path_word): + """ + 创建问题句向量,设置sen_count=10000, 防止内存不够奔溃 + :param sen_count: int, write sentence_encode num per twice + :param word2vec_model: model + :param qa_path: str + :param matrix_ques_path: str + :return: + """ + if os.path.exists(matrix_ques_path_word): + file_matrix_ques = open(matrix_ques_path_word, 'rb') + matrix_ques = pickle.load(file_matrix_ques) + return matrix_ques + print('create_matrix_org_pkl start!') + qa_dail = txtRead(qa_path, encodeType='utf-8') + # questions = [] + matrix_ques = [] + count = 0 + for qa_dail_one in qa_dail: + ques = getChinese(qa_dail_one.split('\t')[0]) + # questions.append(ques) + word_list, flag_list = word_segment_process(ques) + sentence_vec = encoding_question(word2vec_model, word_list, flag_list) + matrix_ques.append(sentence_vec) + if len(matrix_ques)%sen_count == 0 and len(matrix_ques) != 0: + print("count: " + str(count)) + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques) + matrix_ques = [] + # break + + count += 1 + np.savetxt(projectdir + "/Data/sentence_vec_encode_word/" + str(count)+".txt", matrix_ques) + # matrix_ques = [] + # file_matrix_ques = open(matrix_ques_path, 'wb') + # pickle.dump(matrix_ques, file_matrix_ques) + print('create_matrix_org_np ok!') + # return matrix_ques + + +if __name__ == '__main__': + # 读取问答语料 + syn_qa_dails = txtRead(chicken_and_gossip_path, encodeType='utf-8') + + # 读取词向量,w2v_model_wiki_word_path数据是自己训练的,w2v_model_merge_short_path只取了部分数据,你可以前往下载 + if os.path.exists(w2v_model_wiki_word_path): + word2vec_model = load_word2vec_model(w2v_model_wiki_word_path, limit=None) + print("load w2v_model_wiki_word_path ok!") + else: + word2vec_model = load_word2vec_model(w2v_model_merge_short_path, limit=None) + print("load w2v_model_merge_short_path ok!") + + # 创建标准问答中问题的句向量,存起来,到matrix_ques_path + if not os.path.exists(matrix_ques_part_path): + create_matrix_org_np(sen_count=100000, word2vec_model=word2vec_model, qa_path=chicken_and_gossip_path, matrix_ques_path_word=matrix_ques_part_path) + + # 读取 + print("np.loadtxt(matrix_ques_part_path) start!") + matrix_ques = np.loadtxt(matrix_ques_part_path) + print("np.loadtxt(matrix_ques_part_path) end!") + while True: + print("你: ") + ques_ask = input() + ques_clean = getChinese(ques_ask) + word_list, flag_list = word_segment_process(ques_clean) + sentence_vic = encoding_question(word2vec_model, word_list, flag_list) + top_20_qid = most_similar_sentence_vec(sentence_vic, matrix_ques, top_vec=20) + try: + print("小姜机器人: " + syn_qa_dails[top_20_qid[0][0]].strip().split("\t")[1]) + print([(syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[0], syn_qa_dails[top_20_qid[i][0]].strip().split("\t")[1]) for i in range(len(top_20_qid))]) + except Exception as e: + # 有的字符可能打不出来 + print(str(e)) diff --git a/FeatureProject/__init__.py 
b/FeatureProject/__init__.py new file mode 100644 index 0000000..98d55da --- /dev/null +++ b/FeatureProject/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/29 23:10 +# @author :Mo +# @function : \ No newline at end of file diff --git a/FeatureProject/__pycache__/__init__.cpython-36.pyc b/FeatureProject/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..1a3197e Binary files /dev/null and b/FeatureProject/__pycache__/__init__.cpython-36.pyc differ diff --git a/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc new file mode 100644 index 0000000..782f92e Binary files /dev/null and b/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc differ diff --git a/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc new file mode 100644 index 0000000..e6a5b80 Binary files /dev/null and b/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc differ diff --git a/FeatureProject/cut_td_idf.py b/FeatureProject/cut_td_idf.py new file mode 100644 index 0000000..ac2c4b0 --- /dev/null +++ b/FeatureProject/cut_td_idf.py @@ -0,0 +1,104 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/1 10:35 +# @author :Mo +# @function :cut sentences + + +from conf.path_config import chicken_and_gossip_path, td_idf_cut_path, td_idf_cut_pinyin +from utils.text_tools import txtWrite, txtRead, get_syboml, strQ2B +from conf.path_config import projectdir +from gensim import corpora, models +import xpinyin +import pickle +import jieba + + +def cut_td_idf(sources_path, target_path): + """ + 结巴切词,汉语 + :param path: + :return: + """ + print("cut_td_idf start! ") + corpus = txtRead(sources_path) + governments = [] + for corpus_one in corpus: + corpus_one_clear = corpus_one.replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + governments.append(ques_q2b_syboml.strip()) + + government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments)) + + topic_ques_all = [] + for topic_ques_one in government_ques: + top_ques_aqlq = topic_ques_one.replace(' ', ' ').replace(' ', ' ').strip() + '\n' + topic_ques_all.append(top_ques_aqlq) + + txtWrite(topic_ques_all, target_path) + print("cut_td_idf ok! " + sources_path) + + +def cut_td_idf_pinyin(sources_path, target_path): #获取拼音 + """ + 汉语转拼音 + :param path: + :return: + """ + pin = xpinyin.Pinyin() + corpus = txtRead(sources_path) + topic_ques_all = [] + corpus_count = 0 + for corpus_one in corpus: + corpus_count += 1 + # time1 = time.time() + corpus_one_clear = corpus_one.replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').replace(' ', '').strip(), ' ') + topic_ques_all.append(ques_q2b_syboml_pinying + '\n') + # time2 = time.time() + # print(str(corpus_count) + 'time:' + str(time2 - time1)) + txtWrite(topic_ques_all, target_path) + print("cut_td_idf_pinyin ok! 
" + sources_path) + + +def init_tfidf_chinese_or_pinyin(sources_path): + """ + 构建td_idf + :param path: + :return: + """ + questions = txtRead(sources_path) + corpora_documents = [] + for item_text in questions: + item_seg = list(jieba.cut(str(item_text).strip())) + corpora_documents.append(item_seg) + + dictionary = corpora.Dictionary(corpora_documents) + corpus = [dictionary.doc2bow(text) for text in corpora_documents] + tfidf_model = models.TfidfModel(corpus) + print("init_tfidf_chinese_or_pinyin ok! " + sources_path) + file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') + pickle.dump([dictionary, tfidf_model], file) + + +if __name__ == '__main__': + # path_text = projectdir + '/Data/chicken_gossip.txt' + # sentences = txtRead(path_text) + # sentences_q = [] + # for sentences_one in sentences: + # sentences_one_replace = sentences_one.replace(" ", "").replace("\t", "") + # sentences_one_replace_split = sentences_one_replace.split("|") + # sentence_new = sentences_one_replace_split[0] + "\t" + "".join(sentences_one_replace_split[1:]) + # sentences_q.append(sentence_new) + # sentences = txtWrite(sentences_q, projectdir + '/Data/chicken_and_gossip.txt') + + + cut_td_idf(chicken_and_gossip_path, td_idf_cut_path) + cut_td_idf_pinyin(chicken_and_gossip_path, td_idf_cut_pinyin) + init_tfidf_chinese_or_pinyin(td_idf_cut_path) + init_tfidf_chinese_or_pinyin(td_idf_cut_pinyin) + print("corpus ok!") + diff --git a/FeatureProject/distance_text_or_vec.py b/FeatureProject/distance_text_or_vec.py new file mode 100644 index 0000000..b501b99 --- /dev/null +++ b/FeatureProject/distance_text_or_vec.py @@ -0,0 +1,330 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + +from sklearn.feature_extraction.text import TfidfVectorizer +from utils.text_tools import txtRead, get_syboml, strQ2B +import Levenshtein as Leven +from fuzzywuzzy import fuzz +import jieba.analyse +import numpy as np +import xpinyin +import pickle +import jieba +import os + + +zero_bit = 0.000000001 +pin = xpinyin.Pinyin() + + +def clear_sentence(sentence): + """ + 数据清晰,全角转半角 + :param sentence: str, input sentence + :return: str, clearned sentences + """ + corpus_one_clear = str(sentence).replace(' ', '').strip() + ques_q2b = strQ2B(corpus_one_clear.strip()) + ques_q2b_syboml = get_syboml(ques_q2b) + return ques_q2b_syboml + + +def chinese2pinyin(sentence): + """ + chinese translate to pingyin + :param sentence: str, input sentence + :return: str, output pingyin + """ + ques_q2b_syboml_pinying = pin.get_pinyin(sentence, ' ') + return ques_q2b_syboml_pinying + + +def hamming_distance(v1, v2): + n = int(v1, 2) ^ int(v2, 2) + return bin(n & 0xffffffff).count('1') + + +def cosine_distance(v1, v2): # 余弦距离 + if v1.all() and v2.all(): + return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) + else: + return 0 + + +def euclidean_distance(v1, v2): # 欧氏距离 + return np.sqrt(np.sum(np.square(v1 - v2))) + + +def manhattan_distance(v1, v2): # 曼哈顿距离 + return np.sum(np.abs(v1 - v2)) + + +def chebyshev_distance(v1, v2): # 切比雪夫距离 + return np.max(np.abs(v1 - v2)) + + +def minkowski_distance(v1, v2): # 闵可夫斯基距离 + return np.sqrt(np.sum(np.square(v1 - v2))) + + +def euclidean_distance_standardized(v1, v2): # 标准化欧氏距离 + v1_v2 = np.vstack([v1, v2]) + sk_v1_v2 = np.var(v1_v2, axis=0, ddof=1) + return np.sqrt(((v1 - v2) ** 2 / (sk_v1_v2 + zero_bit * np.ones_like(sk_v1_v2))).sum()) + + +def mahalanobis_distance(v1, v2): # 马氏距离 + # 马氏距离要求样本数要大于维数,否则无法求协方差矩阵 + # 此处进行转置,表示10个样本,每个样本2维 + X = 
np.vstack([v1, v2]) + XT = X.T + + # 方法一:根据公式求解 + S = np.cov(X) # 两个维度之间协方差矩阵 + try: + SI = np.linalg.inv(S) # 协方差矩阵的逆矩阵 todo + except: + SI = np.zeros_like(S) + # 马氏距离计算两个样本之间的距离,此处共有10个样本,两两组合,共有45个距离。 + n = XT.shape[0] + distance_all = [] + for i in range(0, n): + for j in range(i + 1, n): + delta = XT[i] - XT[j] + distance_1 = np.sqrt(np.dot(np.dot(delta, SI), delta.T)) + distance_all.append(distance_1) + return np.sum(np.abs(distance_all)) + + +def bray_curtis_distance(v1, v2): # 布雷柯蒂斯距离, 生物学生态距离 + up_v1_v2 = np.sum(np.abs(v2 - v1)) + down_v1_v2 = np.sum(v1) + np.sum(v2) + return up_v1_v2 / (down_v1_v2 + zero_bit) + + +def pearson_correlation_distance(v1, v2): # 皮尔逊相关系数(Pearson correlation) + v1_v2 = np.vstack([v1, v2]) + return np.corrcoef(v1_v2)[0][1] + + +def jaccard_similarity_coefficient_distance(v1, v2): # 杰卡德相似系数(Jaccard similarity coefficient) + # 方法一:根据公式求解 + v1 = np.asarray(v1) + v2 = np.asarray(v2) + up = np.double(np.bitwise_and((v1 != v2), np.bitwise_or(v1 != 0, v2 != 0)).sum()) + down = np.double(np.bitwise_or(v1 != 0, v2 != 0).sum() + zero_bit) + return up / down + + +def wmd_distance(model, sent1_cut_list, sent2_cut_list): # WMD距离 + # model.init_sims(replace=True) + distance = model.wmdistance(sent1_cut_list, sent2_cut_list) + return distance + + +# def HamMings_Levenshtein(str1, str2): +# sim = Leven.hamming(str1, str2) +# return sim + +def edit_levenshtein(str1, str2): + return Leven.distance(str1, str2) + + +def ratio_levenshtein(str1, str2): + return Leven.ratio(str1, str2) + + +def jaro_levenshtein(str1, str2): + return Leven.jaro(str1, str2) + + +def set_ratio_fuzzywuzzy(str1, str2): + return fuzz.token_set_ratio(str1, str2) + + +def sort_ratio_fuzzywuzzy(str1, str2): + return fuzz.token_sort_ratio(str1, str2) + + +def num_of_common_sub_str(str1, str2): + ''' + 求两个字符串的最长公共子串 + 思想:建立一个二维数组,保存连续位相同与否的状态 + ''' + lstr1 = len(str1) + lstr2 = len(str2) + record = [[0 for i in range(lstr2 + 1)] for j in range(lstr1 + 1)] # 多一位 + maxNum = 0 # 最长匹配长度 + p = 0 # 匹配的起始位 + + for i in range(lstr1): + for j in range(lstr2): + if str1[i] == str2[j]: + # 相同则累加 + record[i + 1][j + 1] = record[i][j] + 1 + if record[i + 1][j + 1] > maxNum: + # 获取最大匹配长度 + maxNum = record[i + 1][j + 1] + # 记录最大匹配长度的终止位置 + p = i + 1 + # return str1[p - maxNum:p], maxNum + return maxNum + + +####################################################### 汉明距离 +def string_hash(source): + if source == "": + return 0 + else: + x = ord(source[0]) << 7 + m = 1000003 + mask = 2 ** 128 - 1 + for c in source: + x = ((x * m) ^ ord(c)) & mask + x ^= len(source) + if x == -1: + x = -2 + x = bin(x).replace('0b', '').zfill(64)[-64:] + + return str(x) + + +def sim_hash(content): + seg = jieba.cut(content) + keyWord = jieba.analyse.extract_tags('|'.join(seg), topK=20, withWeight=True, allowPOS=()) + # 先按照权重排序,再按照词排序 + keyList = [] + # print(keyWord) + for feature, weight in keyWord: + weight = int(weight * 20) + feature = string_hash(feature) + temp = [] + for f in feature: + if f == '1': + temp.append(weight) + else: + temp.append(-weight) + keyList.append(temp) + content_list = np.sum(np.array(keyList), axis=0) + # 编码读不出来 + if len(keyList) == 0: + return '00' + simhash = '' + for c in content_list: + if c > 0: + simhash = simhash + '1' + else: + simhash = simhash + '0' + return simhash + + +def hamming_distance_equal(v1, v2): + n = int(v1, 2) ^ int(v2, 2) + return bin(n & 0xffffffff).count('1') + + +def hamming_distance(sen1, sen2): + return hamming_distance_equal(sim_hash(sen1), sim_hash(sen2)) + + +def normalization(x): + 
""" + 归一化,最大最小值 + :param x: + :return: + """ + return [(float(i) - min(x)) / float(max(x) - min(x) + zero_bit) for i in x] + + +def z_score(x, axis=0): + """ + 标准化 + :param x: arrary, numpy + :param axis: int, 0 + :return: arrary, numpy + """ + x = np.array(x).astype(float) + xr = np.rollaxis(x, axis=axis) + xr -= np.mean(x, axis=axis) + xr /= np.std(x, axis=axis) + # print(x) + return x + + +def tok_td_idf(data_path): + if os.path.exists(data_path + 'td_idf_cut.csv'): + '''#计算TD-DIDF,获取训练测试数据''' + datas = txtRead(data_path + 'td_idf_cut.csv') + # 默认值只匹配长度≥2的单词,修改为1;ngram_range特征所以有2个词的,总计词语50428个 + # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000) + vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3, + max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000) + vec_tdidf.fit_transform(datas) + file_vec_tdidf = open(data_path + 'td_idf_cut_model.pkl', 'wb') + pickle.dump(vec_tdidf, file_vec_tdidf) + + return vec_tdidf + + +def tok_td_idf_pinyin(data_path): + if os.path.exists(data_path + 'td_idf_cut_pinyin.csv'): + '''#计算TD-DIDF,获取训练测试数据''' + datas = txtRead(data_path + 'td_idf_cut_pinyin.csv') + # 默认值只匹配长度≥2的单词,修改为1;ngram_range特征所以有2个词的,总计词语50428个 + # vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=1, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1,max_features=30000) + vec_tdidf = TfidfVectorizer(ngram_range=(1, 2), token_pattern=r"(?u)\b\w+\b", min_df=3, + max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1, max_features=50000) + vec_tdidf.fit_transform(datas) + file_vec_tdidf = open(data_path + 'td_idf_cut_pinyin_model.pkl', 'wb') + pickle.dump(vec_tdidf, file_vec_tdidf) + + return vec_tdidf + + +if __name__ == '__main__': + vec1_test = np.array([1, 38, 17, 32]) + vec2_test = np.array([5, 6, 8, 9]) + + str1_test = "你到底是谁?" 
+ str2_test = "没想到我是谁,是真样子" + + print(clear_sentence(str1_test)) # 数据处理 + print(chinese2pinyin(str1_test)) # 中文转拼音 + + print(euclidean_distance(vec1_test, vec2_test)) + print(cosine_distance(vec1_test, vec2_test)) + print(manhattan_distance(vec1_test, vec2_test)) + print(euclidean_distance(vec1_test, vec2_test)) + print(chebyshev_distance(vec1_test, vec2_test)) + print(minkowski_distance(vec1_test, vec2_test)) + + print(euclidean_distance_standardized(vec1_test, vec2_test)) + print(mahalanobis_distance(vec1_test, vec2_test)) + + print('###############################################') + + print(bray_curtis_distance(vec1_test, vec2_test)) + print(pearson_correlation_distance(vec1_test, vec2_test)) + print(jaccard_similarity_coefficient_distance(vec1_test, vec2_test)) + + print('###############################################') + + # print(HamMings_Levenshtein(str1, str2)),需要等长 + # print(Wmd_distance(model, sent1_cut_list, sent2_cut_list)) # 需要gensim word2vec model + + print(hamming_distance(str1_test, str2_test)) + print(edit_levenshtein(str1_test, str2_test)) + print(ratio_levenshtein(str1_test, str2_test)) + print(jaro_levenshtein(str1_test, str2_test)) + print(set_ratio_fuzzywuzzy(str1_test, str2_test)) + print(sort_ratio_fuzzywuzzy(str1_test, str2_test)) + print(num_of_common_sub_str(str1_test, str2_test)) + print(normalization(vec1_test)) # 归一化(0-1) + print(z_score(vec1_test)) # 标准化(0附近,正负) + + # data_path = 'D:/workspace/python/bitbucket/nlp_model_v1.0/nlp_model/models/word_feature/sim_data/' + # tok_TD_IDF(data_path) + # tok_TD_IDF_pinyin(data_path) diff --git a/FeatureProject/distance_vec_TS_SS.py b/FeatureProject/distance_vec_TS_SS.py new file mode 100644 index 0000000..447ee79 --- /dev/null +++ b/FeatureProject/distance_vec_TS_SS.py @@ -0,0 +1,84 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 10:36 +# @author :Mo +# @function :TS-SS distance +# @url :https://github.com/taki0112/Vector_Similarity +# @paper :A Hybrid Geometric Approach for Measuring Similarity Level Among Documents and Document Clustering + + +import numpy as np +import math + +zero_bit = 0.000000001 + + +def Cosine(vec1, vec2): + """ + 余弦相似度 + :param vec1: arrary + :param vec2: arrary + :return: float + """ + result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2) + zero_bit) + return result + + +def VectorSize(vec): + vec_pow = sum(math.pow(v + zero_bit, 2) for v in vec) + if vec_pow >= 0: + return math.sqrt(vec_pow) + else: + return zero_bit + + +def InnerProduct(vec1, vec2): + try: + return sum(v1 * v2 for v1, v2 in zip(vec1, vec2)) + except: + return zero_bit + + +def Euclidean(vec1, vec2): + vec12_pow = sum(math.pow((v1 - v2), 2) for v1, v2 in zip(vec1, vec2)) + if vec12_pow >= 0: + return math.sqrt(vec12_pow) + else: + return zero_bit + + +def Theta(vec1, vec2): + cosine_vec12 = Cosine(vec1, vec2) + if -1 <= cosine_vec12 and cosine_vec12 <= 1: + return math.acos(cosine_vec12) + 10 + else: + return zero_bit + 10 + + +def Triangle(vec1, vec2): + theta = math.radians(Theta(vec1, vec2)) + return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2 + + +def Magnitude_Difference(vec1, vec2): + return abs(VectorSize(vec1) - VectorSize(vec2)) + + +def Sector(vec1, vec2): + ED = Euclidean(vec1, vec2) + MD = Magnitude_Difference(vec1, vec2) + theta = Theta(vec1, vec2) + return math.pi * math.pow((ED + MD), 2) * theta / 360 + + +def TS_SS(vec1, vec2): + return Triangle(vec1, vec2) * Sector(vec1, vec2) + + +if __name__ == '__main__': + vec1_test = np.array([1, 38, 17, 32]) + 
vec2_test = np.array([5, 6, 8, 9]) + + print(Euclidean(vec1_test, vec2_test)) + print(Cosine(vec1_test, vec2_test)) + print(TS_SS(vec1_test, vec2_test)) diff --git a/FeatureProject/normalization_util.py b/FeatureProject/normalization_util.py new file mode 100644 index 0000000..4edf2b1 --- /dev/null +++ b/FeatureProject/normalization_util.py @@ -0,0 +1,96 @@ +# -*- coding: UTF-8 -*- +#!/usr/bin/python +# @Time :2019/3/12 14:18 +# @author :Mo +# @site :https://blog.csdn.net/rensihui + +from sklearn import preprocessing +import numpy as np + +def autoL1L2(data, norms = 'l1'): + '''L1或者L2正则化''' + return preprocessing.normalize(data, norm = norms) + +def autoScale(data): + '''标准化, (X-mean)/std.得到的结果是,对于每个属性/每列来说所有数据都聚集在0附近,方差为1。''' + return preprocessing.scale(data) + +def autoMinMaxScaler(data): + '''将属性缩放到一个指定范围''' + return preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(data) + +def autoLinNorm(data): # 传入一个矩阵 + ''' 0-1归一化 + :param data: []矩阵 + :return: [] + ''' + mins = data.min(0) # 返回data矩阵中每一列中最小的元素,返回一个列表 + maxs = data.max(0) # 返回data矩阵中每一列中最大的元素,返回一个列表 + ranges = maxs - mins # 最大值列表 - 最小值列表 = 差值列表 + normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 + row = data.shape[0] # 返回 data矩阵的行数 + normData = data - np.tile(mins, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 + normData = normData / np.tile(ranges, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) + return normData + + + +def autoAvgNorm(data): # 传入一个矩阵 + ''' 均值归一化 + :param data: []矩阵 + :return: [] + ''' + avg = np.average(data, axis=1) # 返回data矩阵中每一列中最小的元素,返回一个列表 + sigma = np.std(data, axis=1) # 返回data矩阵中每一列中最大的元素,返回一个列表 + normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 + row = data.shape[0] # 返回 data矩阵的行数 + normData = data - np.tile(avg, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 + normData = normData / np.tile(sigma, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) + return normData + + + +###Sigmoid函数;Sigmoid函数是一个具有S形曲线的函数,是良好的阈值函数,在(0, 0.5)处中心对称,在(0, 0.5)附近有比较大的斜率, +# 而当数据趋向于正无穷和负无穷的时候,映射出来的值就会无限趋向于1和0,是个人非常喜欢的“归一化方法”,之所以打引号是因为我觉得Sigmoid函数在 +# 阈值分割上也有很不错的表现,根据公式的改变,就可以改变分割阈值,这里作为归一化方法,我们只考虑(0, 0.5)作为分割阈值的点的情况: +def sigmoid(data,useStatus): + ''' sig归一化 + :param data: []矩阵 + :return: [] + ''' + if useStatus: + row=data.shape[0] + column=data.shape[1] + normData = np.zeros(np.shape(data)) + for i in range(row): + for j in range(column): + normData[i][j]=1.0 / (1 + np.exp(-float(data[i][j]))); + return normData + else: + return float(data); + +if __name__ == '__main__': + arr = np.array([[8, 7, 8], [4, 3, 1], [6, 9, 8]]) + + print("l1正则化") + print(autoL1L2(arr, norms='l1')) + + print("l2正则化") + print(autoL1L2(arr, norms='l2')) + + print("0-1标准化处理") + print(autoScale(arr)) + + print("0-1缩放处理") + print(autoMinMaxScaler(arr)) + + + print("0-1归一化处理") + print(autoLinNorm(arr)) + + + print("均值归一化处理") + print(autoAvgNorm(arr)) + + print("sig归一化处理") + print(sigmoid(arr,True)) diff --git a/FeatureProject/sentence_sim_feature.py b/FeatureProject/sentence_sim_feature.py new file mode 100644 index 0000000..ffe056c --- /dev/null +++ b/FeatureProject/sentence_sim_feature.py @@ -0,0 +1,384 @@ +# -*- coding:utf-8 -*- +# -*- created by: moyongzhuo -*- + + +from FeatureProject.distance_text_or_vec import euclidean_distance, cosine_distance, manhattan_distance, euclidean_distance, jaccard_similarity_coefficient_distance +from FeatureProject.distance_text_or_vec import chebyshev_distance, minkowski_distance, euclidean_distance_standardized +from FeatureProject.distance_text_or_vec import 
mahalanobis_distance, bray_curtis_distance, pearson_correlation_distance +from FeatureProject.distance_text_or_vec import wmd_distance, normalization, z_score +from FeatureProject.distance_text_or_vec import hamming_distance, edit_levenshtein, ratio_levenshtein, jaro_levenshtein, set_ratio_fuzzywuzzy, sort_ratio_fuzzywuzzy +from FeatureProject.distance_text_or_vec import clear_sentence, chinese2pinyin, num_of_common_sub_str +from conf.path_config import word2_vec_path, td_idf_path, td_idf_path_pinyin +from FeatureProject.distance_vec_TS_SS import TS_SS +from gensim import corpora, models, matutils +from conf.path_config import projectdir +from gensim.models import KeyedVectors +import pandas as pd +import numpy as np +import pickle +import jieba +import time +import os + + +class SentenceSimFeature: + def __init__(self): + self.sen1 = None + self.sen2 = None + self.seg1 = None + self.seg2 = None + self.sen_vec1 = None + self.sen_vec2 = None + self.tfidf_vec1 = None + self.tfidf_vec2 = None + self.dictionary = None + self.tfidf_model = None + self.w2c_model = None + + self.tfidf_pinyin_model = None + self.dictionary_pinyin = None + self.sen1_pinyin = None + self.sen2_pinyin = None + self.seg1_pinyin = None + self.seg2_pinyin = None + self.tfidf_vec1_pinyin = None + self.tfidf_vec2_pinyin = None + + def set_data(self, sen1, sen2): + sen1 = clear_sentence(sen1) + sen2 = clear_sentence(sen2) + self.sen1 = str(sen1).strip() + self.sen2 = str(sen2).strip() + self.seg1 = list(jieba.cut(sen1)) + self.seg2 = list(jieba.cut(sen2)) + self.sen1_pinyin = chinese2pinyin(sen1) + self.sen2_pinyin = chinese2pinyin(sen2) + self.seg1_pinyin = (self.sen1_pinyin).split(' ') + self.seg2_pinyin = (self.sen2_pinyin).split(' ') + self.sen_vec1 = np.zeros(300) + self.sen_vec2 = np.zeros(300) + # self.tfidf_vec1 = np.array((self.tfidf_model.transform([' '.join(self.seg1)])).toarray().tolist()[0]) + # self.tfidf_vec2 = np.array((self.tfidf_model.transform([' '.join(self.seg2)])).toarray().tolist()[0]) + # self.tfidf_vec1_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg1_pinyin)])).toarray().tolist()[0]) + # self.tfidf_vec2_pinyin = np.array((self.tfidf_pinyin_model.transform([' '.join(self.seg2_pinyin)])).toarray().tolist()[0]) + self.tfidf_vec1 = self.tfidf_model[self.dictionary.doc2bow(self.seg1)] + self.tfidf_vec2 = self.tfidf_model[self.dictionary.doc2bow(self.seg2)] + self.tfidf_vec1_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg1_pinyin)] + self.tfidf_vec2_pinyin = self.tfidf_pinyin_model[self.dictionary_pinyin.doc2bow(self.seg2_pinyin)] + + def same_word_count(self): + count_left = 0 + for s in self.seg1: + if s in self.seg2: + count_left += 1 + + count_right = 0 + for s in self.seg2: + if s in self.seg1: + count_right += 1 + + return min(count_left, count_right) + + def same_char_count(self): + seg1 = list(self.sen1) + seg2 = list(self.sen2) + + count_left = 0 + for s in seg1: + if s in seg2: + count_left += 1 + + count_right = 0 + for s in seg2: + if s in seg1: + count_right += 1 + + return min(count_left, count_right) + + def sentence_length(self): + len_sen1 = len(self.sen1) + len_sen2 = len(self.sen2) + len_abs_sub = abs(len_sen1 - len_sen2) + len_rate = len_sen1 / len_sen2 + len_add_rate = len_sen1 * len_sen2 / (len_sen1 + len_sen2) + + return [len_abs_sub, len_rate, len_add_rate] + + def init_sentence_vector(self): + # file_path = os.path.dirname(__file__) + print('load w2v model begin') + # model_path = os.path.join(file_path, word2_vec_path) + 
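        # (editor's note) Loading the full wiki word2vec file takes a long time and a lot of
        # RAM; gensim's `limit` argument (already exposed here, e.g. limit=200000) loads only
        # the first N vectors, which is usually enough for quick experiments with these features.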
self.w2c_model = KeyedVectors.load_word2vec_format(word2_vec_path, unicode_errors='ignore', limit=None) # ,binary=True) + print('load w2v model success') + + def encode_sentence_vector(self): + for s in self.seg1: + try: + self.sen_vec1 += self.w2c_model[s] + except: + self.sen_vec1 += np.zeros(300) + continue + + for s in self.seg2: + try: + self.sen_vec2 += self.w2c_model[s] + except: + self.sen_vec2 += np.zeros(300) + continue + + def init_tfidf(self): + file = open(td_idf_path, 'rb') + tfidf_dictionary_model = pickle.load(file) + self.dictionary = tfidf_dictionary_model[0] + self.tfidf_model = tfidf_dictionary_model[1] + + file = open(td_idf_path_pinyin, 'rb') + tfidf_dictionary_pinyin_model = pickle.load(file) + self.dictionary_pinyin = tfidf_dictionary_pinyin_model[0] + self.tfidf_pinyin_model = tfidf_dictionary_pinyin_model[1] + print("init_tfidf ok!") + + def w2c_all_vec(self): + w2c_Cosine = cosine_distance(self.sen_vec1, self.sen_vec2) + w2c_TS_SS = TS_SS(self.sen_vec1, self.sen_vec2) + w2c_Manhattan = manhattan_distance(self.sen_vec1, self.sen_vec2) + w2c_Euclidean = euclidean_distance(self.sen_vec1, self.sen_vec2) + w2c_Jaccard = jaccard_similarity_coefficient_distance(self.sen_vec1, self.sen_vec2) + + w2c_Chebyshev = chebyshev_distance(self.sen_vec1, self.sen_vec2) + w2c_Minkowski = minkowski_distance(self.sen_vec1, self.sen_vec2) + + w2c_Euclidean_Standard = euclidean_distance_standardized(self.sen_vec1, self.sen_vec2) + w2c_Mahalanobis = mahalanobis_distance(self.sen_vec1, self.sen_vec2) + w2c_Bray = bray_curtis_distance(self.sen_vec1, self.sen_vec2) + w2c_Pearson = pearson_correlation_distance(self.sen_vec1, self.sen_vec2) + + # w2c_Wmd = Wmd_Distance(self.w2c_model, self.sen_vec1, self.sen_vec2) + return [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, + w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson] + + def tdidf_all_vec(self): + + return matutils.cossim(self.tfidf_vec1, self.tfidf_vec2) + + def edit_all_str(self): + str_hamming = hamming_distance(self.sen1, self.sen2) + str_edit = edit_levenshtein(self.sen1, self.sen2) + str_ratio = ratio_levenshtein(self.sen1, self.sen2) + str_jaro = jaro_levenshtein(self.sen1, self.sen2) + str_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1, self.sen2) + str_sort_ratio_fuzz = sort_ratio_fuzzywuzzy(self.sen1, self.sen2) + str_commonsubstr = num_of_common_sub_str(self.sen1, self.sen2) + str_list_Wmd = wmd_distance(self.w2c_model, self.seg1, self.seg2) + + return [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] + + def word_jaccard(self): + a = list(set(self.seg1).intersection(set(self.seg2))) + b = list(set(self.seg1).union(set(self.seg2))) + return float(len(a) / len(b)) + + def char_jaccard(self): + a = list(set(list(self.sen1)).intersection(set(list(self.sen2)))) + b = list(set(list(self.sen1)).union(set(list(self.sen2)))) + + return float(len(a) / len(b)) + + def tdidf_all_vec_pinyin(self): + + return matutils.cossim(self.tfidf_vec1_pinyin, self.tfidf_vec2_pinyin) + + def edit_all_pinyin(self): + pinyin_hamming = hamming_distance(self.sen1_pinyin, self.sen2_pinyin) + pinyin_edit = edit_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_ratio = ratio_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_jaro = jaro_levenshtein(self.sen1_pinyin, self.sen2_pinyin) + pinyin_set_ratio_fuzz = set_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin) + pinyin_sort_ratio_fuzz = 
sort_ratio_fuzzywuzzy(self.sen1_pinyin, self.sen2_pinyin) + pinyin_commonsubstr = num_of_common_sub_str(self.sen1_pinyin, self.sen2_pinyin) + pinyin_list_Wmd = wmd_distance(self.w2c_model, self.seg1_pinyin, self.seg2_pinyin) + + return [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] + + def word_jaccard_pinyin(self): + a = list(set(self.seg1_pinyin).intersection(set(self.seg2_pinyin))) + b = list(set(self.seg1_pinyin).union(set(self.seg2_pinyin))) + return float(len(a) / len(b)) + + def char_jaccard_pinyin(self): + a = list(set(list(self.seg1_pinyin)).intersection(set(list(self.seg2_pinyin)))) + b = list(set(list(self.seg1_pinyin)).union(set(list(self.seg2_pinyin)))) + + return float(len(a) / len(b)) + + +def sentence_input_t(): + while True: + s1 = input('s1: ') + s2 = input('s2: ') + + start_time = time.time() + ssf.set_data(s1, s2) + ssf.encode_sentence_vector() + + time1 = time.time() + print('set_data time:' + str(time1 - start_time)) + + # 相同词、长度 + same_word_count = ssf.same_word_count() + time2 = time.time() + print('same_word_count time:' + str(time2 - time1)) + + same_char_count = ssf.same_char_count() + time3 = time.time() + print('same_char_count time:' + str(time3 - time2)) + + [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length() + time4 = time.time() + print('sentence_length time:' + str(time4 - time3)) + + # w2c_all_vec + [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, + w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, + w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec() + time5 = time.time() + print('w2c_all_vec time:' + str(time5 - time4)) + + # tdidf_all_vec + # [tdidf_Cosine, tdidf_TS_SS, tdidf_Manhattan, tdidf_Euclidean, + # tdidf_Jaccard, tdidf_Chebyshev,tdidf_Minkowski, tdidf_Euclidean_Standard, tdidf_Mahalanobis, + # tdidf_Bray, tdidf_Pearson] = ssf.tdidf_all_vec() + tdidf_cossim = ssf.tdidf_all_vec() + time6 = time.time() + print('tdidf_all_vec time:' + str(time6 - time5)) + + # edit_all_str + [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str() + time7 = time.time() + print('edit_all_str time:' + str(time7 - time6)) + + # jaccard系数 + word_jaccard = ssf.word_jaccard() + char_jaccard = ssf.char_jaccard() + time8 = time.time() + print('jaccard系数 time:' + str(time8 - time7)) + + # tdidf_all_vec_pinyin + # [tdidf_piyin_Cosine, tdidf_piyin_TS_SS, tdidf_piyin_Manhattan, tdidf_piyin_Euclidean, tdidf_piyin_Jaccard, + # tdidf_piyin_Chebyshev, tdidf_piyin_Minkowski, tdidf_piyin_Euclidean_Standard, tdidf_piyin_Mahalanobis, + # tdidf_piyin_Bray, tdidf_piyin_Pearson] = ssf.tdidf_all_vec_pinyin() + tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin() + time9 = time.time() + print('tdidf_all_vec_pinyin time:' + str(time9 - time8)) + + # edit_all_pinyin + [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin() + time10 = time.time() + print('edit_all_pinyin time:' + str(time10 - time9)) + + # jaccard系数 + word_jaccard_pinyin = ssf.word_jaccard_pinyin() + char_jaccard_pinyin = ssf.char_jaccard_pinyin() + time11 = time.time() + print('jaccard系数pinyin time:' + str(time11 - time10)) + sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate, + w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, 
w2c_Minkowski, + w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson, + tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, + str_sort_ratio_fuzz, + str_commonsubstr, str_list_Wmd, + word_jaccard, char_jaccard, tdidf_pinyin_cossim, + pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz, + pinyin_sort_ratio_fuzz, + pinyin_commonsubstr, pinyin_list_Wmd, + word_jaccard_pinyin, char_jaccard_pinyin] + print("sim: ") + print(sim_all_last) + + +if __name__ == '__main__': + ssf = SentenceSimFeature() + ssf.init_sentence_vector() + ssf.init_tfidf() + s1 = "你知道Mo的能力上限吗" + s2 = "你好呀,Mo水平很差" + start_time = time.time() + + ssf.set_data(s1, s2) + ssf.encode_sentence_vector() + + time1 = time.time() + print('set_data time:' + str(time1 - start_time)) + + # 相同词、长度 + same_word_count = ssf.same_word_count() + time2 = time.time() + print('same_word_count time:' + str(time2 - time1)) + + same_char_count = ssf.same_char_count() + time3 = time.time() + print('same_char_count time:' + str(time3 - time2)) + + [len_abs_sub, len_rate, len_add_rate] = ssf.sentence_length() + time4 = time.time() + print('sentence_length time:' + str(time4 - time3)) + + # w2c_all_vec + [w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, + w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, w2c_Euclidean_Standard, w2c_Mahalanobis, + w2c_Bray, w2c_Pearson] = ssf.w2c_all_vec() + time5 = time.time() + print('w2c_all_vec time:' + str(time5 - time4)) + + # tdidf_all_vec + tdidf_cossim = ssf.tdidf_all_vec() + time6 = time.time() + print('tdidf_all_vec time:' + str(time6 - time5)) + + # edit_all_str + [str_hamming, str_edit, str_ratio, str_jaro, + str_set_ratio_fuzz, str_sort_ratio_fuzz, str_commonsubstr, str_list_Wmd] = ssf.edit_all_str() + time7 = time.time() + print('edit_all_str time:' + str(time7 - time6)) + + # jaccard系数 + word_jaccard = ssf.word_jaccard() + char_jaccard = ssf.char_jaccard() + time8 = time.time() + print('jaccard系数 time:' + str(time8 - time7)) + + # pinyin + tdidf_pinyin_cossim = ssf.tdidf_all_vec_pinyin() + time9 = time.time() + print('tdidf_all_vec_pinyin time:' + str(time9 - time8)) + + # edit_all_pinyin + [pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, + pinyin_set_ratio_fuzz, pinyin_sort_ratio_fuzz, pinyin_commonsubstr, pinyin_list_Wmd] = ssf.edit_all_pinyin() + time10 = time.time() + print('edit_all_pinyin time:' + str(time10 - time9)) + + # jaccard系数 + word_jaccard_pinyin = ssf.word_jaccard_pinyin() + char_jaccard_pinyin = ssf.char_jaccard_pinyin() + time11 = time.time() + print('jaccard系数pinyin time:' + str(time11 - time10)) + + sim_all_last = [same_word_count, same_char_count, len_abs_sub, len_rate, len_add_rate, + w2c_Cosine, w2c_TS_SS, w2c_Manhattan, w2c_Euclidean, w2c_Jaccard, w2c_Chebyshev, w2c_Minkowski, + w2c_Euclidean_Standard, w2c_Mahalanobis, w2c_Bray, w2c_Pearson, + tdidf_cossim, str_hamming, str_edit, str_ratio, str_jaro, str_set_ratio_fuzz, str_sort_ratio_fuzz, + str_commonsubstr, str_list_Wmd, + word_jaccard, char_jaccard, tdidf_pinyin_cossim, + pinyin_hamming, pinyin_edit, pinyin_ratio, pinyin_jaro, pinyin_set_ratio_fuzz, + pinyin_sort_ratio_fuzz, + pinyin_commonsubstr, pinyin_list_Wmd, + word_jaccard_pinyin, char_jaccard_pinyin] + print("小姜机器人计算sim: ") + print(sim_all_last) + + sentence_input_t() diff --git a/conf/__init__.py b/conf/__init__.py new file mode 100644 index 0000000..b238954 --- /dev/null +++ b/conf/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# 
@function : \ No newline at end of file diff --git a/conf/__pycache__/__init__.cpython-36.pyc b/conf/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..9042934 Binary files /dev/null and b/conf/__pycache__/__init__.cpython-36.pyc differ diff --git a/conf/__pycache__/path_config.cpython-36.pyc b/conf/__pycache__/path_config.cpython-36.pyc new file mode 100644 index 0000000..6ee0fdc Binary files /dev/null and b/conf/__pycache__/path_config.cpython-36.pyc differ diff --git a/conf/path_config.py b/conf/path_config.py new file mode 100644 index 0000000..72bf2aa --- /dev/null +++ b/conf/path_config.py @@ -0,0 +1,39 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# @function :path + + +import pathlib +import sys +import os + + +# base dir +projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent) +sys.path.append(projectdir) +print(projectdir) + +# corpus +chicken_and_gossip_path = projectdir + '/Data/corpus/chicken_and_gossip.txt' + +# word2vec +w2v_model_merge_short_path = projectdir + "/Data/chinese_vector/w2v_model_merge_short.vec" + +# tf_idf +td_idf_cut_path = projectdir + '/Data/tf_idf/td_idf_cut.csv' +td_idf_cut_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin.csv' +td_idf_path_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin_dictionary_model.pkl' +td_idf_path = projectdir + '/Data/tf_idf/td_idf_cut_dictionary_model.pkl' + +# word, 句向量 +w2v_model_wiki_word_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_word.vec' +matrix_ques_part_path = projectdir + '/Data/sentence_vec_encode_word/1.txt' + +# char, 句向量 +w2v_model_char_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_char.vec' +matrix_ques_part_path_char = projectdir + '/Data/sentence_vec_encode_char/1.txt' + +# word2vec select +word2_vec_path = w2v_model_wiki_word_path if os.path.exists(w2v_model_wiki_word_path) else w2v_model_merge_short_path \ No newline at end of file diff --git a/python-version-time b/python-version-time new file mode 100644 index 0000000..54dcdff --- /dev/null +++ b/python-version-time @@ -0,0 +1,15 @@ +Python 3.3.2(May 15, 2013) +Python 3.2.5(May 15, 2013) +Python 3.1.5(April 10, 2012) +Python 3.0.1(February 13, 2009) +Python 2.7.5(May 15, 2013) +Python 2.6.8(April 10, 2012) +Python 2.5.6(May 26, 2011) +Python 2.4.6(December 19, 2008) +Python 2.3.7(March 11, 2008) +Python 2.2.3(May 30, 2003) +Python 2.1.3(April 8, 2002) +Python 2.0.1(June 2001) +Python 1.6.1(September 2000) +Python 1.5.2(April 1999) +Older releases:Source releases,binaries-1.1,binaries-1.2,binaries-1.3,binaries-1.4,binaries-1.5 diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..b6c21a2 --- /dev/null +++ b/readme.md @@ -0,0 +1,49 @@ +# nlp_xiaojiang + +# Data + - chinese_vector + - 截取的部分word2vec训练词向量(自己需要下载全效果才会好) + - corpus + - 小黄鸡和gossip问答预料(数据没清洗) + - sentence_vec_encode_char + - 1.txt(字向量生成的前100000句向量) + - sentence_vec_encode_word + - 1.txt(词向量生成的前100000句向量) + - tf_idf(chicken_and_gossip.txt生成的tf-idf) + +# ChatBot + - 检索式ChatBot + - 像ES那样直接检索(如使用fuzzywuzzy),只能字面匹配 + - 构造句向量,检索问答库,能够检索有同义词的句子 + - 生成式ChatBot(todo) + - seq2seq + - GAN + +# FeatureProject + - normalization_util指的是数据归一化 + - 0-1归一化处理 + - 均值归一化 + - sig归一化处理 + - sim feature(这里只有ML,没有bert、emlo等的句向量相似度) + - distance_text_or_vec:各种计算文本、向量距离等 + - distance_vec_TS_SS:TS_SS计算词向量距离 + - cut_td_idf:将小黄鸡语料和gossip结合 + - sentence_sim_feature:计算两个文本的相似度或者距离,例如qq(问题和问题),或者qa(问题和答案) + +# run + - 1.创建tf-idf文件等(运行2需要先跑1): python cut_td_idf.py + - 
diff --git a/requestments.txt b/requestments.txt
new file mode 100644
index 0000000..2195e9c
--- /dev/null
+++ b/requestments.txt
@@ -0,0 +1,12 @@
+python-Levenshtein==0.12.0
+fuzzywuzzy==0.17.0
+openpyxl==2.6.2
+pandas==0.24.2
+xpinyin==0.5.6
+numpy==1.16.1
+gensim==3.7.1
+pyemd==0.5.1
+jieba==0.39
+xlrd==1.2.0
+sklearn
+pathlib
diff --git a/result_test/__init__.py b/result_test/__init__.py
new file mode 100644
index 0000000..cdaeb55
--- /dev/null
+++ b/result_test/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/3 14:40
+# @author :Mo
+# @function : 
\ No newline at end of file
diff --git a/result_test/result_chatbot_fuzzy.txt b/result_test/result_chatbot_fuzzy.txt
new file mode 100644
index 0000000..cd5b755
--- /dev/null
+++ b/result_test/result_chatbot_fuzzy.txt
@@ -0,0 +1,38 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+read questions ok!
+[... remainder of the log: several interactive question/answer rounds with the top-5 fuzzy matches; the Chinese console output in this capture was written with the wrong encoding (GBK mojibake) and is not recoverable ...]
diff --git a/result_test/result_chatbot_sentence_vec_by_char.txt b/result_test/result_chatbot_sentence_vec_by_char.txt
new file mode 100644
index 0000000..f750d52
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_char.txt
@@ -0,0 +1,55 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+np.loadtxt(matrix_ques_part_path_char) ok!
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_char.py:115: RuntimeWarning: invalid value encountered in true_divide
+  matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+'gbk' codec can't encode character '\u301c' in position 227: illegal multibyte sequence
+"word ' ' not in vocabulary"
+list index out of range
+'gbk' codec can't encode character '\u2207' in position 329: illegal multibyte sequence
+[... remainder of the log: question/answer rounds with ranked candidate lists, interleaved with more 'gbk' codec encode errors; the Chinese console output in this capture is GBK mojibake and not recoverable ...]
diff --git a/result_test/result_chatbot_sentence_vec_by_word.txt b/result_test/result_chatbot_sentence_vec_by_word.txt
new file mode 100644
index 0000000..2c1f251
--- /dev/null
+++ b/result_test/result_chatbot_sentence_vec_by_word.txt
@@ -0,0 +1,73 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load_word2vec_model start!
+load_word2vec_model end!
+load w2v_model_wiki_word_path ok!
+np.loadtxt(matrix_ques_part_path) start!
+np.loadtxt(matrix_ques_part_path) end!
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.815 seconds.
+Prefix dict has been built succesfully.
+D:/workspace/pythonMyCode/django_project/nlp_xiaojiang/ChatBot/chatbot_search/chatbot_sentence_vec_by_word.py:131: RuntimeWarning: invalid value encountered in true_divide
+  matrix_org_norm = (matrix_org / np.sqrt((matrix_org ** 2).sum(-1))[..., np.newaxis]).astype(numpy_type)
+[... remainder of the log: question/answer rounds with ranked candidate lists, interleaved with more 'gbk' codec encode errors; the Chinese console output in this capture is GBK mojibake and not recoverable ...]
diff --git a/result_test/result_sentence_sim_feature.txt b/result_test/result_sentence_sim_feature.txt
new file mode 100644
index 0000000..f5adfec
--- /dev/null
+++ b/result_test/result_sentence_sim_feature.txt
@@ -0,0 +1,37 @@
+Connected to pydev debugger (build 171.3780.115)
+D:\workspace\pythonMyCode\django_project\nlp_xiaojiang
+load w2v model begin
+load w2v model success
+Building prefix dict from the default dictionary ...
+Loading model from cache C:\Users\MOYONG~1\AppData\Local\Temp\jieba.cache
+Loading model cost 0.719 seconds.
+set_data time0.7200782299041748
+Prefix dict has been built succesfully.
+same_word_count time0.0 +same_char_count time0.0 +sentence_length time0.0 +w2c_all_vec time0.1994335651397705 +tdidf_all_vec time0.0 +edit_all_str time0.0019953250885009766 +jaccardϵ time0.0 +tdidf_all_vec_pinyin time0.0 +edit_all_pinyin time0.004553556442260742 +jaccardϵpinyin time0.0 +sim: +[1, 3, 1, 1.1, 5.238095238095238, 0.6782572237857507, 3461.1677906854284, 283.83272299933014, 19.980963040347838, 0.9999999999966667, 3.0830289870500565, 19.980963040347838, 24.494821131252575, 79619.83774188746, -5.10379204991808, 0.6769724044408956, 0.0, 12, 9, 0.2857142857142857, 0.5242424242424243, 19, 19, 2, 8.141546895617283, 0.08333333333333333, 0.16666666666666666, 0.008081558347970244, 17, 22, 0.5217391304347826, 0.6838686096962837, 56, 47, 4, 6.190419904893637, 0.11764705882352941, 0.11764705882352941] +s1: +s2: ߵúѽ +set_data time0.0009706020355224609 +same_word_count time0.0009982585906982422 +same_char_count time0.0 +sentence_length time0.0 +w2c_all_vec time0.20846796035766602 +tdidf_all_vec time0.0 +edit_all_str time0.0019943714141845703 +jaccardϵ time0.0 +tdidf_all_vec_pinyin time0.0 +edit_all_pinyin time0.0019960403442382812 +jaccardϵpinyin time0.0 +sim: +[2, 3, 1, 0.875, 3.7333333333333334, 0.8200504988005877, 3746.94646712115, 236.48076447923086, 17.65693370974129, 0.9999999999966667, 4.2634280025959015, 17.65693370974129, 24.494877087856107, 78956.49194315828, -13.367107715032754, 0.8200018973656127, 0.07174613344073014, 21, 6, 0.4, 0.6011904761904762, 40, 40, 1, 5.620521171774245, 0.2, 0.25, 0.36243089354552877, 10, 15, 0.5384615384615384, 0.6417797888386123, 62, 58, 5, 6.01776904578638, 0.25, 0.25] +s1: \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..d838479 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 15:15 +# @author :Mo +# @function : \ No newline at end of file diff --git a/utils/text_tools.py b/utils/text_tools.py new file mode 100644 index 0000000..a8b40f6 --- /dev/null +++ b/utils/text_tools.py @@ -0,0 +1,322 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/3 11:23 +# @author :Mo +# @function :utils, tools + + +from openpyxl import Workbook +import logging as logger +import gensim +import jieba +import time +import xlrd +import re + + +#中英文标点符号 +filters='[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + '!,;:。?、“”’‘《》()~@#¥%……&*\()/{}【】…=-]' +#标点符号、空格 +filters_1 = "[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》()~@#¥%……&*\(\)\/\-]+" + +"""去除标点符号、空格""" +def clear_punctuation(text): + """去除标点符号""" + sentence = text.replace(' ', '') + sentence_punctuation_clear = re.sub(filters, ' ', sentence).strip() + sentence_punctuation_clear_replace = sentence_punctuation_clear.replace(' ', ' ').replace(' ', ' ') + return sentence_punctuation_clear_replace + + +'''截取中文、拼音、数字,去除特殊字符等''' +def getChinese1(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + findAllChinese = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z])", ques)) + # print(sub_str) + return findAllChinese + + +'''xlrd读xls''' +def xlsRead(sheetName=None, cols=0, fileXlsPath=None): + '''读xls文件''' + workbook = xlrd.open_workbook(fileXlsPath) + # 根据sheet索引或者名称获取sheet内容 + sheet = workbook.sheet_by_name(sheetName) + nrows = sheet.nrows + ncols = sheet.ncols + + listRows = [] + for i in range(nrows): + listRows.append(sheet.row_values(i)) + + return listRows + + +'''openpyxl写xlsx''' +def xlsxWrite(sheetName, writeList, fileXlsName): + wb = Workbook() + print('{}'.format(wb.get_sheet_names())) # 
提供一个默认名叫Sheet的表,office2016下新建提供默认Sheet1 + sheet = wb.create_sheet(sheetName) + # i = 0 + for listLine_one in writeList: + # i += 1 + sheet.append(listLine_one) + # if i == 1000: + # break + wb.save(fileXlsName) + + + +"""判断一个unicode是否是英文字母""" +def is_alphabet(uchar): + """判断一个unicode是否是英文字母""" + if (uchar >= u'\u0041' and uchar <= u'\u005a') or (uchar >= u'\u0061' and uchar <= u'\u007a'): + return True + else: + return False + +'''读取txt文件''' +def txtRead(filePath, encodeType = 'utf-8'): + listLine = [] + try: + file = open(filePath, 'r', encoding= encodeType) + + while True: + line = file.readline() + if not line: + break + + listLine.append(line) + + file.close() + + except Exception as e: + logger.info(str(e)) + + finally: + return listLine + +'''读取txt文件''' +def txtWrite(listLine, filePath, type = 'w',encodeType='utf-8'): + + try: + file = open(filePath, type, encoding=encodeType) + file.writelines(listLine) + file.close() + + except Exception as e: + logger.info(str(e)) + +'''截取中文、拼音、数字,去除特殊字符等''' +'''要保留特殊字符的格式,最好的方法是每个字符都去匹配''' + +def getChinese(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + ques = strQ2B(ques) + answer = '' + for ques_one in ques: + ques_one_findall = ''.join(re.findall(u"([\u4e00-\u9fa50-9A-Za-z峣㒶㒰玘宸諕鄕缓緩𪥵嬆嬲煙草砼赟贇龘㗊㵘㙓敠])", ques_one)) + if not ques_one_findall: + ques_one_findall = ' ' + answer = answer + ques_one_findall + answer = answer.strip().replace(' ', ' ').replace(' ', ' ') + return answer.upper() + +'''去除标点符号''' + +def get_syboml(ques): + # ques = '•“鑫菁英”教育分期手续费怎么收取?可以' + ques = strQ2B(ques) + # answer = re.sub(u'([。.,,、\;;::??!!“”"‘’'''()()…——-《》<>{}_~【】\\[])', ' ', ques).replace(' ', ' ').replace(' ', ' ') + answer = re.sub("[\.\!\/_,?;:$%^*<>()+\"\']+|[!,;:。?、“”’‘《》[\](|){}【】~@#¥%…&*\/\-—_]+", " ", ques).strip() + return answer + +'''xlrd读xls''' + +def xlsRead(sheetName=None, cols=0, fileXlsPath=None): + '''读xls文件''' + workbook = xlrd.open_workbook(fileXlsPath) + # 根据sheet索引或者名称获取sheet内容 + sheet = workbook.sheet_by_name(sheetName) + nrows = sheet.nrows + ncols = sheet.ncols + + listRows = [] + for i in range(nrows): + listRows.append(sheet.row_values(i)) + + return listRows + +'''openpyxl写xlsx''' + +def xlsxWrite(sheetName, writeList, fileXlsName): + wb = Workbook() + print('{}'.format(wb.get_sheet_names())) # 提供一个默认名叫Sheet的表,office2016下新建提供默认Sheet1 + sheet = wb.create_sheet(sheetName) + # i = 0 + for listLine_one in writeList: + # i += 1 + sheet.append(listLine_one) + # if i == 1000: + # break + wb.save(fileXlsName) + +'''读取txt文件''' + +def txtRead(filePath, encodeType='utf-8'): + listLine = [] + try: + file = open(filePath, 'r', encoding=encodeType) + + while True: + line = file.readline() + if not line: + break + + listLine.append(line) + + file.close() + + except Exception as e: + logger.info(str(e)) + + finally: + return listLine + +'''读取txt文件''' + +def txtWrite(listLine, filePath, type='w', encodeType='utf-8'): + + try: + file = open(filePath, type, encoding=encodeType) + file.writelines(listLine) + file.close() + + except Exception as e: + logger.info(str(e)) + +# -*- coding: cp936 -*- +def strQ2B(ustring): + """全角转半角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 12288: # 全角空格直接转换 + inside_code = 32 + elif (inside_code >= 65281 and inside_code <= 65374): # 全角字符(除空格)根据关系转化 + inside_code -= 65248 + + rstring += chr(inside_code) + return rstring + +def strB2Q(ustring): + """半角转全角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 32: # 半角空格直接转化 + inside_code = 12288 + elif 
inside_code >= 32 and inside_code <= 126: # 半角字符(除空格)根据关系转化 + inside_code += 65248 + + rstring += chr(inside_code) + return rstring + +def is_valid_date(strdate): + '''判断是否是一个有效的日期字符串''' + try: + if ":" in strdate: + time.strptime(strdate, "%Y-%m-%d %H:%M:%S") + else: + time.strptime(strdate, "%Y-%m-%d") + return True + except: + return False + +'''判断是否是全英文的''' + +def is_total_english(text): + """判断一个是否是全英文字母""" + symbol = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + numben = 0 + for one in sentence_punctuation_clear: + if one in symbol: + numben += 1 + if numben == len(sentence_punctuation_clear): + return True + else: + return False + except: + return False + +'''判断是否是数字的''' + +def is_total_number(text): + """判断一个是否是全英文字母""" + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + numben = 0 + for one in sentence_punctuation_clear: + if one.isdigit(): + numben += 1 + if numben == len(sentence_punctuation_clear): + return True + else: + return False + except: + return False + +def is_number_or_english(text): + '''不为数字不为字母''' + judge = False + try: + sentence_punctuation_clear = get_syboml(text) + sentence_punctuation_clear = sentence_punctuation_clear.replace(' ', '').strip() + for words in sentence_punctuation_clear: + judge_number = is_total_number(words) + judge_english = is_total_english(words) + judge = judge_number or judge_english + if not judge: + return False + return judge + except: + return False + +#todo #句子改写,同义词替换,去停用词等 + + +if __name__ == '__main__': + + + # for i in range(10): + # sentence_vec = word2vec_model.wv["的"] + # sentence_vec_pd = pd.DataFrame(sentence_vec) + # sentence_vec_pd.to_csv('my_csv.csv', mode='a', header=False) + + # sentence_ee = pd.read_csv('my_csv.csv') + + # txtWrite([str(sentence_vec)], "gg.txt") + + + # path_test_data_government = '/data/test_data_government.csv' + # sentences = txtRead(path_test_data_government) + sentences = [] + sentences_one_clear_punctuation_all = [] + for sentences_one in sentences[1:]: + sentences_one_1 = sentences_one + sentences_one_clear_punctuation = clear_punctuation(sentences_one_1.replace(',0.0,1.0', '')) + # print(sentences_one) + # print(sentences_one_clear_punctuation) + sentences_one_clear_punctuation_jieba = jieba.cut(sentences_one_clear_punctuation, cut_all=False, HMM=False) + sentences_one_clear_punctuation_jieba_list = ' '.join(list(sentences_one_clear_punctuation_jieba)).replace(' ', ' ').replace(' ', ' ').strip() + sentences_one_clear_punctuation_all.append(sentences_one_clear_punctuation_jieba_list + ',0.0,1.0' + '\n') + + txtWrite(sentences[0:1] + sentences_one_clear_punctuation_all, '/data/test_data_government_cut.csv') + + #',0.0,1.0' + # np.savetxt('001', [word2vec_model.wv["的"], word2vec_model.wv["的"]]) + # gg = np.loadtxt('001') \ No newline at end of file diff --git a/utils/word2vec_vector.py b/utils/word2vec_vector.py new file mode 100644 index 0000000..50a3e9a --- /dev/null +++ b/utils/word2vec_vector.py @@ -0,0 +1,55 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/4 10:00 +# @author :Mo +# @function : + +from __future__ import print_function +from utils.text_tools import txtRead, txtWrite +from gensim.models.word2vec import LineSentence +from gensim.models import Word2Vec +import multiprocessing +import logging +import sys +import os + +def 
train_word2vec_by_word(): + logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') + logging.root.setLevel(level=logging.INFO) + logging.info("running") + + inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" + outp1 = "w2v_model_wiki.model" + outp2 = "w2v_model_wiki_word.vec" + model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) + model.save(outp1) + model.wv.save_word2vec_format(outp2, binary=False) + +def train_word2vec_by_char(): + logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') + logging.root.setLevel(level=logging.INFO) + logging.info("running") + + inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt" + outp1 = "w2v_model_wiki.model" + outp2 = "w2v_model_wiki_char.vec" + model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) + model.save(outp1) + model.wv.save_word2vec_format(outp2, binary=False) + + +if __name__ == '__main__': + train_word2vec_by_word() + # train_word2vec_by_char() + + # inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" + # sentences_char = [] + # sentences = txtRead(inp) + # for sentences_one in sentences: + # sentences_one_replace = sentences_one.strip().replace(" ", "") + # sentences_one_replace_all = [] + # for sentences_one_replace_one in sentences_one_replace: + # sentences_one_replace_all.append(sentences_one_replace_one) + # sentences_char.append(" ".join(sentences_one_replace_all) + "\n") + # txtWrite(sentences_char, "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt") + # gg = 0 \ No newline at end of file
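The .vec files written by train_word2vec_by_word and train_word2vec_by_char above are in the plain-text word2vec format, so they can be loaded back with gensim's KeyedVectors. A short sketch, assuming the word-level file sits in the working directory and the query word is in vocabulary:

```python
# Sketch: load the text-format vectors saved by the training functions above.
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)
print(w2v.vector_size)                   # 300, matching the size=300 used in training
print(w2v.most_similar("中国", topn=5))  # nearest neighbours of an example in-vocabulary word
```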