Add files via upload

2019-05-05 16:15:35 +08:00 · 2019-05-05 16:15:35 +08:00 · a2891940c2
commit a2891940c2
parent 7c10ea984b
1 changed files with 139 additions and 0 deletions
--- a/AugmentText/augment_eda/enhance_word2vec.py
+++ b/AugmentText/augment_eda/enhance_word2vec.py
@ -0,0 +1,139 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time     :2019/4/29 21:26
+# @author   :Mo
+# @function :同义词替换, word2vec版本
+
+from utils.text_tools import txtRead, txtWrite, jieba_cut, load_word2vec_model
+from conf.path_config import word2_vec_path
+import logging as logger
+
# Module-level model shared by word2vec_word_replace(); it is loaded once, as a
# side effect of importing this module, from the path configured in
# conf.path_config.
# NOTE(review): limit_words=10000 presumably caps how many vectors are read
# (small vocabulary => many out-of-vocabulary words) -- confirm in
# utils.text_tools.load_word2vec_model.
word2vec_model = load_word2vec_model(word2_vec_path, limit_words=10000, binary_type=False, encoding_type='utf-8')
+
+
def select_all_syn_sentence(count=0, candidate_list_set=None, syn_sentences=None):
    """
      Expand per-word candidate lists into every combined sentence.

      Starting from the prefixes in ``syn_sentences``, the first ``count`` entries
      of ``candidate_list_set`` are appended one position at a time, e.g. prefixes
      ['1'] with candidates [['1', '2'], ['1']] and count=2 give ['111', '121'].
      For each position the candidates form the outer loop and the existing
      prefixes the inner loop, which fixes the order of the result.
    :param count: int, how many leading entries of candidate_list_set to consume;
                  values <= 0 consume nothing, values larger than the number of
                  entries are clamped instead of raising IndexError
    :param candidate_list_set: list, candidates per remaining word,
                  eg.[['是', '是不是'], ['喜欢', '喜爱', '爱'], ['米饭']]
    :param syn_sentences: list, sentence prefixes built so far, eg.['你']
    :return: list, a new list of combined sentences (the inputs are never mutated),
                  eg.['你是喜欢米饭', '你是不是喜欢米饭', '你是不是爱米饭']
    """
    # None defaults instead of [] so no list object is shared between calls.
    candidates = [] if candidate_list_set is None else candidate_list_set
    sentences = [] if syn_sentences is None else list(syn_sentences)

    # Iterative form of the original recursion: no recursion-depth limit on long
    # sentences, and never walks past the end of the candidate list.
    steps = max(0, min(count, len(candidates)))
    for position_candidates in candidates[:steps]:
        sentences = [prefix + candidate
                     for candidate in position_candidates
                     for prefix in sentences]
    return sentences
+
+
def from_word2vec_get_synonym_words(sentence_cut, word2vec_model, count_words, top_words=20,
                                    topn_word_score=0.75):
    """
      Look up, for every token of a segmented sentence, its close neighbours in a
      word2vec model and build one candidate list per token.
    :param sentence_cut:    list, tokens of the sentence, eg.['你', '喜欢', '谁']
    :param word2vec_model:  object exposing most_similar(word, topn=...) that returns
                            (word, score) pairs (gensim KeyedVectors style)
    :param count_words:     list, extended IN PLACE with the index of every token
                            for which at least one neighbour was accepted
    :param top_words:       int, number of neighbours requested per token
    :param topn_word_score: float, minimum similarity score for a neighbour to be kept
    :return: tuple (sentence_cut_dict, count_words); sentence_cut_dict holds one list
             per token: the accepted neighbours in model order followed by the token
             itself, or just [token] when nothing was accepted
    """
    sentence_cut_dict = []
    for position, word in enumerate(sentence_cut):
        # A list with a membership test keeps the model's ranking, so the result
        # is deterministic (a set would reorder it from run to run).
        synonyms = []
        try:
            for neighbour in word2vec_model.most_similar(word, topn=top_words):
                candidate = neighbour[0]
                if (neighbour[1] >= topn_word_score
                        and candidate != word
                        and candidate not in synonyms):
                    synonyms.append(candidate)
        except Exception as e:
            # Best effort: a failed lookup (e.g. an out-of-vocabulary token) is
            # logged and the token is simply kept unchanged.
            logger.info(str(e))

        if synonyms:
            sentence_cut_dict.append(synonyms + [word])
            count_words.append(position)
        else:
            sentence_cut_dict.append([word])

    return sentence_cut_dict, count_words
+
def word2vec_word_replace(sentence, top_words_put=20, topn_word_score_put=0.75):
    """
      Generate paraphrases of a sentence by synonym replacement only; the synonyms
      come from the module-level word2vec model.
    :param sentence: str, input sentence of user, eg.'我喜欢谁你知道吗'
    :param top_words_put: int, number of nearest neighbours requested per token
    :param topn_word_score_put: float, minimum similarity score for a neighbour to be used
    :return: list, distinct generated sentences in generation order, excluding the
             input sentence itself; empty when nothing can be generated
    """
    sentence_cut = jieba_cut(sentence)
    if not sentence_cut:  # nothing to segment, nothing to paraphrase
        return []

    # One candidate list per token: accepted neighbours plus the token itself.
    candidates_per_word, _ = from_word2vec_get_synonym_words(sentence_cut,
                                                             word2vec_model,
                                                             [],
                                                             top_words=top_words_put,
                                                             topn_word_score=topn_word_score_put)

    # Combine the candidates of every position into full sentences.
    combined = select_all_syn_sentence(count=len(candidates_per_word) - 1,
                                       candidate_list_set=candidates_per_word[1:],
                                       syn_sentences=candidates_per_word[0])

    # Drop the unchanged sentence and duplicates. Seeding `seen` with both the raw
    # input and the re-joined tokens avoids the ValueError that list.remove()
    # raised whenever segmentation did not reproduce the input exactly.
    seen = {sentence, ''.join(sentence_cut)}
    syn_sentences = []
    for candidate_sentence in combined:
        if candidate_sentence not in seen:
            seen.add(candidate_sentence)
            syn_sentences.append(candidate_sentence)
    return syn_sentences
+
+
def get_syn_sentences(sentence_list, create_type='word2vec', top_words_put=20, topn_word_score_put=0.75):
    """
       Generate paraphrases for a batch of sentences.
    :param sentence_list: list, sentences of input, eg.['爱你', '你会什么']
    :param create_type: str, 'word2vec' or 'synonym'; currently unused, only the
                        word2vec strategy is implemented
    :param top_words_put: int, top n words requested from word2vec_model most_similar
    :param topn_word_score_put: float, minimum most_similar score for a word to be used
    :return: list of lists, one per input sentence: the original sentence followed by
             its generated paraphrases
    """
    # TODO: also generate sentences from a synonym dictionary (create_type='synonym').
    # The caller's thresholds are forwarded instead of the previously hard-coded 20 / 0.75.
    return [
        [sentence] + word2vec_word_replace(sentence,
                                           top_words_put=top_words_put,
                                           topn_word_score_put=topn_word_score_put)
        for sentence in sentence_list
    ]
+
+
def get_synonyms_from_word2vec(word2vec_model, word, topn=20, score_top=0.75):
    """
      Return the neighbours of a single word whose similarity reaches a threshold.
    :param word2vec_model: object exposing most_similar(word, topn=...) that returns
                           (word, score) pairs
    :param word: str, word to look up
    :param topn: int, number of neighbours requested from the model
    :param score_top: float, minimum similarity score for a neighbour to be kept
    :return: list, accepted neighbour words in model order; whatever was collected
             before a failure (possibly nothing) when the lookup raises
    """
    synonyms = []
    try:
        for neighbour in word2vec_model.most_similar(word, topn=topn):
            similarity = neighbour[1]
            if similarity >= score_top:
                synonyms.append(neighbour[0])
    except Exception as err:
        # Failures (e.g. unknown word) are only logged; the caller gets a list.
        logger.info(str(err))
    return synonyms
+
if __name__ == "__main__":
    # Demo: paraphrase a single sentence, then a batch, then read sentences
    # interactively from stdin.
    sentence = '2005年美元换人民币的,'
    sentence_list = ['2006年美元换人民币']

    syns1 = word2vec_word_replace(sentence)
    print(syns1)
    # The hint names conf/path_config.py, the module word2_vec_path is imported from.
    print('#####################word2vec词典较小，需要自己在Data/chinese_vector新增，conf/path_config.py需要修改word2_vec_path常量###############################')

    syn_sentences = get_syn_sentences(sentence_list)
    print(syn_sentences)
    print('###########################################################')
    while True:
        print('input: ')
        try:
            sen = input()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the demo instead of dumping a traceback.
            break
        syns1 = word2vec_word_replace(sen)
        print('###########################word2vec_words ###########################################################')
        print(syns1)