diff --git a/AugmentText/Readme.md b/AugmentText/Readme.md
new file mode 100644
index 0000000..b08a700
--- /dev/null
+++ b/AugmentText/Readme.md
@@ -0,0 +1,61 @@
+# AugmentText
+
+# Overview
+  - Compared with image data augmentation, text data augmentation still has many open problems;
+  - Strictly speaking, text augmentation is closer to synonymous-sentence generation, yet not exactly the same: it is a broader concept;
+  - Text augmentation is usually needed for two reasons: not enough data, and imbalanced data.
+  - In my experience, the effective methods of text augmentation are:
+    - back-translation (translate twice, e.g. Chinese to English, then English back to Chinese; a minimal sketch follows this file),
+    - and EDA (synonym replacement, insertion, swap and deletion); insertion and swap were ideas I had not thought of at the time.
+
+
+### GitHub project address ###
+  https://github.com/yongzhuo/nlp_xiaojiang/tree/master/AugmentText
+
+
+# Back-translation (relatively reliable)
+  - 1. Online translation tools (Chinese -> English, French, German, Russian, Spanish, Portuguese, Japanese, Korean, Dutch, Arabic, etc.)
+    - Google Translate (google): needless to say, probably the best and supports the most languages, but I have not yet managed to get over the firewall and register an account
+    - Baidu Translate (baidu): supports the most language pairs among domestic services (28 pairs) and is the most generous: 2 million characters of free quota per month (about 2 MB) after registering, then 49 RMB per million characters
+    - Youdao Translate (youdao): my favourite when I first got online, but expensive; only a 100 RMB trial credit, supports just 11 languages, 48 RMB per million characters
+    - Sogou Translate (sougou): a decent impression, they do run a search engine after all; 78 languages, 200 RMB trial credit, 40 RMB per million characters for common languages and 60 RMB for uncommon ones
+    - Tencent Translate (tencent): Tencent AI always feels a step behind; the interfaces keep changing, and this time its sign encryption (spaces replaced with "+") was a real annoyance. Perhaps AI is just not that important to the penguin.
+      - There are two services, Fanyijun and an AI Lab one, which support fewer languages; they seem to still be under development, with no quota limit but no concurrency guarantee, and only a PHP demo, no Python one
+    - Bing Translate (bing): Microsoft's offering, you know; not tried, the web page demo seems fine
+    - You can use the official toolkits, simulate web-page access, or register accounts
+  - 2. Offline translation tools
+    - 1. Write your own: collect some corpora and train seq2seq, NMT or Transformer models
+    - 2. NiuTrans: a rather old release, runs on Win10 or Linux, but only ships a pre-trained Chinese-English model
+         Address: http://www.niutrans.com/index.html
+
+# Synonym replacement (acceptable)
+  - 1. EDA (essentially synonym replacement, insertion, swap and deletion), paper "Easy data augmentation techniques for boosting performance on text classification tasks"
+    - A Chinese implementation demo: github project zhanlaoban/eda_nlp_for_Chinese, address: https://github.com/zhanlaoban/eda_nlp_for_Chinese
+  - 2. word2vec or dictionary-based synonym replacement
+    - Instead of looking up synonyms with the synonyms toolkit as in 1, you can use gensim word vectors and take the words most similar to a given word as its synonyms (see the sketch after the EDA code further down).
+    - You can also do a mechanical lookup in a synonym dictionary, e.g. fighting41love/funNLP, github address: https://github.com/fighting41love/funNLP/tree/master/data/
+
+# Syntax, sentence expansion, sentence compression (rather difficult)
+  - 1. Sentence compression: find the subject, predicate, object, etc. of a sentence
+    - There is a Java project that calls the Stanford tools (which I do not like) to extract subject-predicate-object
+    - Address: (subject-predicate-object extractor) https://github.com/hankcs/MainPartExtractor
+  - 2. Sentence expansion: todo
+  - 3. Syntax: todo
+
+# HMM-Markov (poor quality)
+  - How HMM sentence generation works: build a state transition matrix from the corpus, start from keywords extracted with jieba, and generate sentences
+  - Reference project: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov
+
+# Deep learning methods (todo)
+  - seq2seq
+  - bert
+  - transformer
+  - GAN
+
+
+# References / Thanks
+  - eda_chinese: https://github.com/zhanlaoban/eda_nlp_for_Chinese
+  - Subject-predicate-object extractor: https://github.com/hankcs/MainPartExtractor
+  - HMM sentence generation: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov
+  - Synonyms etc.: https://github.com/fighting41love/funNLP/tree/master/data/
+  - NiuTrans: http://www.niutrans.com/index.html
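The back-translation recommended above is simply two chained translations. A minimal sketch of that round trip, reusing the `translate` package that `translate_tools/translate_translate.py` below wraps; the pivot language, function name and example sentence are illustrative choices, not part of this project:

```python
# Back-translation sketch: zh -> pivot -> zh; the round-tripped sentence is kept as an augmented paraphrase.
# Assumes the `translate` package used later in translate_tools/translate_translate.py.
from translate import Translator


def back_translate(text, pivot="en"):
    """Translate zh -> pivot -> zh and return the round-tripped sentence."""
    to_pivot = Translator(from_lang="zh", to_lang=pivot)     # zh -> pivot
    back_to_zh = Translator(from_lang=pivot, to_lang="zh")   # pivot -> zh
    return back_to_zh.translate(to_pivot.translate(text))


if __name__ == "__main__":
    print(back_translate("文本数据增强可以用回译"))  # prints an augmented paraphrase of the input
```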
diff --git a/AugmentText/__init__.py b/AugmentText/__init__.py
new file mode 100644
index 0000000..d7bc126
--- /dev/null
+++ b/AugmentText/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 19:44
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/AugmentText/augment_eda/__init__.py b/AugmentText/augment_eda/__init__.py
new file mode 100644
index 0000000..6745f1f
--- /dev/null
+++ b/AugmentText/augment_eda/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 21:14
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/AugmentText/augment_eda/enhance_eda.py b/AugmentText/augment_eda/enhance_eda.py
new file mode 100644
index 0000000..c484cae
--- /dev/null
+++ b/AugmentText/augment_eda/enhance_eda.py
@@ -0,0 +1,235 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/1 10:35
+# @author :Mo
+# @function :enhance text by eda (replace, insert, swap, delete)
+
+
+from utils.text_tools import is_total_english
+from utils.text_tools import is_total_number
+from conf.path_config import stop_words_path
+from utils.text_tools import jieba_cut
+from random import shuffle
+import synonyms
+import random
+
+
+random.seed(2019)
+key_word_list = ["rsh", "mo", "大漠帝国"]
+
+
+# stop-word list, by default the hanlp stop-word table
+f_stop = open(stop_words_path, "r", encoding="utf-8")
+stop_words = []
+for stop_word in f_stop.readlines():
+    stop_words.append(stop_word.strip())
+
+
+def synonym_replacement(words, n, key_words):
+    """
+       synonym replacement: replace n words of the sentence with their synonyms
+    :param words: list, input sentence
+    :param n: int, number of words to replace
+    :return: list, new_words
+    """
+    new_words = words.copy()
+    random_word_list = list(set([word for word in words if word not in stop_words]))
+    random.shuffle(random_word_list)
+    num_replaced = 0
+    for random_word in random_word_list:
+        sim_synonyms = get_syn_by_synonyms(random_word)
+        if len(sim_synonyms) >= 1 and random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
+            synonym = random.choice(sim_synonyms)
+            new_words = [synonym if word == random_word else word for word in new_words]
+            num_replaced += 1
+        if num_replaced >= n:
+            break
+    sentence = ' '.join(new_words)
+    new_words = sentence.split(' ')
+    return new_words
+
+
+def get_syn_by_synonyms(word):
+    if not is_total_english(word.strip()):
+        return synonyms.nearby(word)[0]
+    else:
+        return word
+
+
+def random_insertion(words, n, key_words):
+    """
+       random insertion: randomly insert n words into the sentence
+    :param words: list, input sentence
+    :param n: int, number of words to insert
+    :return: list, new_words
+    """
+    new_words = words.copy()
+    for _ in range(n):
+        add_word(new_words, key_words)
+    return new_words
+
+
+def add_word(new_words, key_words):
+    """
+       insert one synonym at a random position of the list
+    :param new_words: list, input sentence
+    :return: list, new_words
+    """
+    synonyms = []
+    counter = 0
+    while len(synonyms) < 1:
+        random_word = new_words[random.randint(0, len(new_words) - 1)]
+        # skip key words, pure-English and pure-number tokens
+        if random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
+            synonyms = get_syn_by_synonyms(random_word)
+        counter += 1
+        if counter >= 10:
+            return
+    random_synonym = random.choice(synonyms)
+    random_idx = random.randint(0, len(new_words) - 1)
+    new_words.insert(random_idx, random_synonym)
+
+
+def random_swap(words, n):
+    """
+       random swap: swap two random words, n times
+    :param words: list, input sentence
+    :param n: int, number of swaps
+    :return: list, new_words
+    """
+    new_words = words.copy()
+    for _ in range(n):
+        new_words = swap_word(new_words)
+    return new_words
+
+
+def swap_word(new_words):
+    """
+       random swap: swap two random words once
+    :param new_words: list, input sentence
+    :return: list, new_words
+    """
+    random_idx_1 = random.randint(0, len(new_words) - 1)
+    random_idx_2 = random_idx_1
+    counter = 0
+    while random_idx_2 == random_idx_1:
+        random_idx_2 = random.randint(0, len(new_words) - 1)
+        counter += 1
+        if counter > 3:
+            return new_words
+    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
+    return new_words
+
+
+def random_deletion(words, p, key_words):
+    """
+       random deletion: delete each word of the sentence with probability p
+    :param words: list, input sentence
+    :param p: float, deletion probability
+    :return: list, new_words
+    """
+    if len(words) == 1:
+        return words
+
+    new_words = []
+    for word in words:
+        r = random.uniform(0, 1)
+        if r > p or word in key_words:
+            new_words.append(word)
+
+    if len(new_words) == 0:
+        rand_int = random.randint(0, len(words) - 1)
+        return [words[rand_int]]
+
+    return new_words
+
+
+def sentence_replace_whitespace(sentences):
+    """
+       remove whitespace from the generated sentences
+    :param sentences: list,
+    :return: list
+    """
+    sentences_new = []
+    for sentence in sentences:
+        sentence_replace = sentence.replace(" ", "").strip()
+        sentences_new.append(sentence_replace + "\n")
+    return sentences_new
+
+
+def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9, key_words=[]):
+    """
+       EDA function: synonym replacement, word insertion, word-order swap, word deletion
+    :param sentence: str, input sentence
+    :param alpha_sr: float, synonym_replacement rate
+    :param alpha_ri: float, random_insertion rate
+    :param alpha_rs: float, random_swap rate
+    :param p_rd: float, random_deletion probability
+    :param num_aug: int, number of new sentences to generate
+    :return: list, augmented sentences
+    """
+    seg_list = jieba_cut(sentence)
+    seg_list = " ".join(seg_list)
+    words = list(seg_list.split())
+    num_words = len(words)
+
+    augmented_sentences = []
+    num_new_per_technique = int(num_aug*2 / 4) + 1
+    n_sr = max(1, int(alpha_sr * num_words)) * 2
+    n_ri = max(1, int(alpha_ri * num_words)) * 2
+    n_rs = max(1, int(alpha_rs * num_words))
+
+    # synonym replacement (sr)
+    for _ in range(num_new_per_technique):
+        a_words = synonym_replacement(words, n_sr, key_words)
+        augmented_sentences.append(''.join(a_words))
+
+    # random insertion (ri)
+    for _ in range(num_new_per_technique):
+        a_words = random_insertion(words, n_ri, key_words)
+        augmented_sentences.append(''.join(a_words))
+
+    # random swap (rs)
+    for _ in range(num_new_per_technique):
+        a_words = random_swap(words, n_rs)
+        augmented_sentences.append(''.join(a_words))
+
+    # random deletion (rd)
+    for _ in range(num_new_per_technique):
+        a_words = random_deletion(words, p_rd, key_words)
+        augmented_sentences.append(''.join(a_words))
+
+    augmented_sentences = list(set(augmented_sentences))
+    shuffle(augmented_sentences)
+    # drop sentences that are too short
+    augmented_sentences_new = []
+    for augmented_sentences_one in augmented_sentences:
+        if len(augmented_sentences_one) > 5:
+            augmented_sentences_new.append(augmented_sentences_one)
+
+    augmented_sentences = augmented_sentences_new
+    if num_aug >= 1:
+        augmented_sentences = augmented_sentences[:num_aug]
+    else:
+        keep_prob = num_aug / len(augmented_sentences)
+        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]
+
+    if len(augmented_sentences) > num_aug:
+        augmented_sentences = augmented_sentences[0:num_aug]
+    # augmented_sentences.append(seg_list)
+    return augmented_sentences
+
+
+
+
+if __name__ == "__main__":
+    des = get_syn_by_synonyms("同义词")
+    print(des)
+    syn = eda(sentence="rsh喜欢大漠帝国吗", alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=10, key_words=key_word_list)
+    syn_s = sentence_replace_whitespace(syn)
+    print(syn)
+    while True:
+        print('input: ')
+        sen = input()
+        syn = eda(sentence=sen)
+        print(syn)
\ No newline at end of file
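The README's second synonym-replacement option, gensim word vectors instead of the synonyms toolkit, is not implemented in this diff. A minimal sketch under the assumption of a pre-trained Chinese word2vec file in text format; the file name `w2v_words.vec`, the helper name and the `topn` value are placeholders:

```python
# Sketch of a gensim-based alternative to get_syn_by_synonyms above.
# Assumes a pre-trained word2vec file in text format; the path is a placeholder, not part of this repository.
from gensim.models import KeyedVectors

w2v_model = KeyedVectors.load_word2vec_format("w2v_words.vec", binary=False)


def get_syn_by_word2vec(word, topn=8):
    """Return the topn most similar words as synonym candidates (empty list if out of vocabulary)."""
    if word not in w2v_model:
        return []
    return [similar_word for similar_word, _score in w2v_model.most_similar(word, topn=topn)]
```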
diff --git a/AugmentText/augment_marko/__init__.py b/AugmentText/augment_marko/__init__.py
new file mode 100644
index 0000000..6745f1f
--- /dev/null
+++ b/AugmentText/augment_marko/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 21:14
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/AugmentText/augment_marko/enhance_marko.py b/AugmentText/augment_marko/enhance_marko.py
new file mode 100644
index 0000000..e1cb58e
--- /dev/null
+++ b/AugmentText/augment_marko/enhance_marko.py
@@ -0,0 +1,196 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @Time :2019/3/25 14:11
+# @author :Mo
+# @function :generate new sentences with a markov chain
+
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.feature_extraction.text import CountVectorizer
+from conf.path_config import chicken_and_gossip_path
+from conf.path_config import projectdir
+from utils.text_tools import txtRead
+from utils.text_tools import txtWrite
+from jieba import analyse
+import random
+import jieba
+
+
+# TF-IDF keyword-extraction interface
+tfidf = analyse.extract_tags
+# TextRank keyword-extraction interface
+textrank = analyse.textrank
+
+
+def create_model(model_markov, datalist):
+    """
+       create the word-transition model of the corpus
+    :param model_markov: dict
+    :param datalist: list of sentences
+    :return: dict
+    """
+    for line in datalist:
+        line = list(jieba.cut(line.lower().strip(), cut_all=False))
+        for i, word in enumerate(line):
+            if i == len(line) - 1:
+                model_markov['FINISH'] = model_markov.get('FINISH', []) + [word]
+            else:
+                if i == 0:
+                    model_markov['BEGIN'] = model_markov.get('BEGIN', []) + [word]
+                model_markov[word] = model_markov.get(word, []) + [line[i + 1]]
+
+    for key in model_markov.keys():
+        model_markov[key] = list(set(model_markov[key]))
+
+    return model_markov
+
+
+def generate_random_1(model_markov, gen_words):
+    """
+       generate a synonymous sentence from the markov chain; in essence, walk from one word to the next
+    :param model_markov: dict, markov transition dict
+    :param gen_words: list, words generated so far (may start with one key word)
+    :return: str
+    """
+    while True:
+        if not gen_words:
+            words = model_markov['BEGIN']
+        elif gen_words[-1] in model_markov['FINISH']:
+            break
+        else:
+            try:
+                words = model_markov[gen_words[-1]]
+            except Exception as e:
+                return "".join(gen_words) + "\n"
+        # randomly choose one of the candidate next words
+        gen_words.append(random.choice(words))
+
+    return "".join(gen_words) + "\n"
+
+
+def generate_random_select(generated, model_marko, twice=100000, len_min=5):
+    """
+       sample the chain `twice` times and keep the generated sentences
+    :param generated: list, one key word, e.g. ["建议"]
+    :param model_marko: dict, transition matrix
+    :param twice: int, number of sampling rounds
+    :param len_min: int, min length of a generated sentence
+    :return: list, syn_generates
+    """
+    syn_generates = set()
+    for num in range(twice):
+        syn_generate = generate_random_1(model_marko, generated)
+        generated = []
+        if len(syn_generate) > len_min:
+            syn_generates.add(syn_generate)
+    return list(syn_generates)
+
+
+def get_keyword_from_tf(sentences, p):
+    """
+       get the hot words of the corpus by raw term frequency
+    :param sentences: list, cut sentences, joined by " "
+    :param p: float, rate, 0 < p < 1
+    :return: list, words
+    """
+    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
+    # token_pattern sets the pattern for counting terms; the default (as for English) would ignore single characters
+    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
+    # norm=None: do not normalise the term-frequency results
+    # use_idf=False: only term frequency is wanted, so the idf part is ignored
+    transformer = TfidfTransformer(norm=None, use_idf=False)
+    vectorizer.fit_transform(sentence_cut_list)
+    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
+    word = vectorizer.get_feature_names()
+    # weight = tf.toarray()
+    return word[-int(len(word) * p):]
+
+
+def get_begin_word(sentences, p):
+    """
+       get the first word of each sentence after jieba segmentation
+    :param sentences: list, sentences of input
+    :param p: float, rate, 0 < p < 1
+    :return: list, begin words
+    """
+    sentence_cut_begin_list = [list(jieba.cut(text.strip(), cut_all=False, HMM=True))[0] for text in sentences]
+    len_begin_p = int(len(sentence_cut_begin_list) * p)
+    return sentence_cut_begin_list[-len_begin_p:]
+
+
+def get_keyword_from_jieba_tfidf(sentences, p):
+    """
+       keyword extraction based on the TF-IDF algorithm
+    :param sentences: list, sentences of input
+    :return: list, keywords
+    """
+    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
+    sentence_cut_list_str = str(sentence_cut_list)
+    key_word = tfidf(sentence_cut_list_str)
+    return key_word
""" + 鍩轰簬textrank绠楁硶杩涜鍏抽敭璇嶆娊鍙 + :param sentence: str, sentence of input + :return: list, return keyword + """ + key_words = [] + for sentences_one in sentences: + key_word = textrank(sentences_one) + key_words = key_words + key_word + # token_pattern鎸囧畾缁熻璇嶉鐨勬ā寮, 涓嶆寚瀹, 榛樿濡傝嫳鏂, 涓嶇粺璁″崟瀛 + vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b') + vectorizer.fit_transform(key_words) + word = vectorizer.get_feature_names() + return word[-int(len(word) * p):] + + +def generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1): + """ + 璇诲彇txt鏂囦欢鍘熻鍙ワ紝鑾峰彇娌℃湁鐨勭敓鎴愬彞瀛 + :param txt_path: str, path of corpus + :param begin_word: str, "tf", "tfidf", "textrank" + :param p: float, rate, 0 < p < 1 + :return: list, generated sentence + """ + # 鑾峰彇鐑棬鍏抽敭璇 + if begin_word == "tf": + generated_hot = get_keyword_from_tf(sentence_list, p) + elif begin_word == "textrank": + generated_hot = get_keyword_from_jieba_textrank(sentence_list, p) + elif begin_word == "begin_word": + generated_hot = get_begin_word(sentence_list, p) + else: + generated_hot = get_keyword_from_jieba_tfidf(sentence_list, p) + + # 鍒涘缓浼犻掓ā鍨 + model_txt = {} + model_txt = create_model(model_txt, sentence_list) + # 浠ュ叧閿瘝寮澶达紝鏋勫缓鍚屼箟鍙 + gen_all_syn = [] + for generated_hot_one in generated_hot: + generated_hot_one_1 = [generated_hot_one] + generated_str = generate_random_select(generated_hot_one_1, model_txt, twice=1000, len_min=5) + if generated_str: + gen_all_syn = gen_all_syn + generated_str + # 鎻愬彇鍘熷彞涓病鏈夌殑閮ㄥ垎 + gen_all_syn = list(set(gen_all_syn)) + # 鐢熸垚鍙ュ瓙涓庡師鍙ョ殑浜ら泦 + syn_intersection = list(set(sentence_list).intersection(set(gen_all_syn))) + # 鐢熸垚鍙ュ瓙鍑忓幓浜ら泦 + gen_syns = list(set(gen_all_syn).difference(set(syn_intersection))) + return gen_syns + + +if __name__ == "__main__": + # 璇诲彇涓涓枃浠讹紝鍐嶇敓鎴愬彞瀛 + txt_path = chicken_and_gossip_path + sentence_list = txtRead(txt_path) + sentence_list = sentence_list[0:100] + enhance_texts = generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1) + for enhance_texts_one in enhance_texts: + try: + print(enhance_texts_one) + except Exception as e: + print(str(e)) \ No newline at end of file diff --git a/AugmentText/augment_syntax/__init__.py b/AugmentText/augment_syntax/__init__.py new file mode 100644 index 0000000..7dad124 --- /dev/null +++ b/AugmentText/augment_syntax/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/9 21:16 +# @author :Mo +# @function : \ No newline at end of file diff --git a/AugmentText/augment_translate/__init__.py b/AugmentText/augment_translate/__init__.py new file mode 100644 index 0000000..1f0ae12 --- /dev/null +++ b/AugmentText/augment_translate/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/9 21:15 +# @author :Mo +# @function : \ No newline at end of file diff --git a/AugmentText/augment_translate/translate_account/__init__.py b/AugmentText/augment_translate/translate_account/__init__.py new file mode 100644 index 0000000..6efb462 --- /dev/null +++ b/AugmentText/augment_translate/translate_account/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: UTF-8 -*- +# !/usr/bin/python +# @time :2019/4/9 22:58 +# @author :Mo +# @function : \ No newline at end of file diff --git a/AugmentText/augment_translate/translate_account/translate_tencent_secret.py b/AugmentText/augment_translate/translate_account/translate_tencent_secret.py new file mode 100644 index 0000000..98d894e --- /dev/null +++ b/AugmentText/augment_translate/translate_account/translate_tencent_secret.py @@ -0,0 +1,107 @@ +# -*- 
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 23:05
+# @author :Mo
+# @function :back-translation with a Tencent account (Fanyijun)
+
+
+from conf.augment_constant import language_short_tencent
+from conf.augment_constant import app_secret_tentcnet
+from conf.augment_constant import app_key_tencent
+from urllib.parse import quote
+import logging as logger
+import requests
+import hashlib
+import random
+import string
+import time
+import json
+
+
+def md5_sign(text):
+    """
+       md5 of the sign string
+    :param text: str, sentence
+    :return: str, uppercase hex digest
+    """
+    md5_model = hashlib.md5(text.encode("utf8"))
+    return md5_model.hexdigest().upper()
+
+
+def get_params(text, from_l="zh", to_l="en"):
+    """
+       build the sign and the request params
+    :param text: str, input sentence
+    :param from_l: source language
+    :param to_l: target language
+    :return: dict, params
+    """
+    # request timestamp (seconds), used to prevent replay (the signature stays valid for 5 minutes)
+    time_stamp = str(int(time.time()))
+    # random nonce string, makes the signature unpredictable
+    nonce_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
+    params = {'app_id': app_key_tencent,
+              'source': from_l,
+              'target': to_l,
+              'text': text,
+              'time_stamp': time_stamp,
+              'nonce_str': nonce_str
+              }
+    signs = ''
+    # sort the keys before concatenating
+    for key in sorted(params):
+        # the value part must be URL-encoded with uppercase hex, e.g. %E8 (quote uses uppercase by default), and spaces become "+"
+        signs += '{}={}&'.format(key, quote(params[key], safe='').replace("%20", "+"))
+    # append the application secret with the key name app_key to the end of the sign string
+    signs += 'app_key={}'.format(app_secret_tentcnet)
+    # md5 the whole string to obtain the request signature
+    sign = md5_sign(signs)
+    params['sign'] = sign
+    return params
+
+
+def any_to_any_translate_tencent(text, from_='zh', to_='en'):
+    """
+       call Tencent translation from one language to another; see the constant language_short_tencent for supported languages
+    :param text: str, input sentence
+    :param from_: source language
+    :param to_: target language
+    :return: str, translated sentence
+    """
+    try:
+        url = "https://api.ai.qq.com/fcgi-bin/nlp/nlp_texttranslate"
+        params_text = get_params(text, from_l=from_, to_l=to_)
+        res_post = requests.request("POST", url, data=params_text)
+        res_content = res_post.content.decode("utf8")
+        res_json = json.loads(res_content)
+        target_text = res_json["data"]["target_text"]
+        return target_text
+    except Exception as e:
+        logger.error(str(e))
+        return None
+
+
+def translate_tencent_back(text, from_='zh', to_='en'):
+    """
+       back-translation: call Tencent translation twice
+    :param text: str, input sentence
+    :param from_: source language
+    :param to_: target language
+    :return: str, back-translated sentence
+    """
+    try:
+        text_from_to = any_to_any_translate_tencent(text, from_=from_, to_=to_)
+        text_to_from = any_to_any_translate_tencent(text_from_to, from_=to_, to_=from_)
+        return text_to_from
+    except Exception as e:
+        logger.error(str(e))
+        return None
+
+
+
+if __name__ == '__main__':
+    text_test = "你觉得JY会喜欢暗影随风、大漠帝国吗".strip()
+    for to_test in language_short_tencent:
+        res_test = translate_tencent_back(text_test, from_='zh', to_=to_test)
+        print("empty if no account is configured, back-translation result: ")
+        print(res_test)
diff --git a/AugmentText/augment_translate/translate_tools/__init__.py b/AugmentText/augment_translate/translate_tools/__init__.py
new file mode 100644
index 0000000..50ebcb6
--- /dev/null
+++ b/AugmentText/augment_translate/translate_tools/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 22:57
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/AugmentText/augment_translate/translate_tools/translate_translate.py b/AugmentText/augment_translate/translate_tools/translate_translate.py
new file mode 100644
index 0000000..f68e10e
--- /dev/null
+++ b/AugmentText/augment_translate/translate_tools/translate_translate.py
@@ -0,0 +1,46 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 23:05
+# @author :Mo
+# @function :back-translation with the translate.Translator tool
+
+
+from conf.augment_constant import language_short_google
+from utils.text_tools import judge_translate_english
+from translate import Translator
+
+
+def translate_tools_translate(text, to_='en'):
+    """
+       round-trip a sentence with the translate package
+    :param text: str, input
+    :param to_: str, pivot language type
+    :return: str, result
+    """
+    # provider = 'mymemory','microsoft'
+    translator1 = Translator(to_lang=to_, from_lang='zh', provider=None, secret_access_key=None)
+    translator2 = Translator(to_lang="zh", from_lang=to_, provider=None, secret_access_key=None)
+
+    translation1 = translator1.translate(text)
+    translation2 = translator2.translate(translation1)
+    return translation2
+
+
+if __name__ == "__main__":
+    sen_org = "大漠帝国喜欢RSH、JY吗"
+    for language_short_google_one in language_short_google:
+        text_translate = translate_tools_translate(sen_org, to_=language_short_google_one)
+        judge = judge_translate_english(sen_org, text_translate)
+        if judge:
+            print("True")
+            print(text_translate)
+        else:
+            print("False")
+            print(text_translate)
+# test results:
+# False
+# 沙漠帝国是否像RSH，JY？
+# False
+# 沙漠帝国看起来像RSH，JY？
+# False
+# 帝国沙漠像rsh，jy？
\ No newline at end of file
diff --git a/AugmentText/augment_translate/translate_web/__init__.py b/AugmentText/augment_translate/translate_web/__init__.py
new file mode 100644
index 0000000..6efb462
--- /dev/null
+++ b/AugmentText/augment_translate/translate_web/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/4/9 22:58
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/AugmentText/augment_translate/translate_web/translate_google.py b/AugmentText/augment_translate/translate_web/translate_google.py
new file mode 100644
index 0000000..2ab2b02
--- /dev/null
+++ b/AugmentText/augment_translate/translate_web/translate_google.py
@@ -0,0 +1,154 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @Time :2019/3/21 14:30
+# @author :Mo
+# @function :back-translation via Google Translate web, simulating the google token access
+
+from conf.augment_constant import language_short_google
+from utils.text_tools import judge_translate_english
+import logging as logger
+import urllib.parse as parse
+import requests
+import execjs
+
+
+class GoogleToken:
+    def __init__(self):
+        self.ctx = execjs.compile("""
+        function TL(a) {
+            var k = "";
+            var b = 406644;
+            var b1 = 3293161072;
+            var jd = ".";
+            var $b = "+-a^+6";
+            var Zb = "+-3^+b+-f";
+            for (var e = [], f = 0, g = 0; g < a.length; g++) {
+                var m = a.charCodeAt(g);
+                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
+                e[f++] = m >> 18 | 240,
+                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
+                e[f++] = m >> 6 & 63 | 128),
+                e[f++] = m & 63 | 128)
+            }
+            a = b;
+            for (f = 0; f < e.length; f++) a += e[f],
+            a = RL(a, $b);
+            a = RL(a, Zb);
+            a ^= b1 || 0;
+            0 > a && (a = (a & 2147483647) + 2147483648);
+            a %= 1E6;
+            return a.toString() + jd + (a ^ b)
+        };
+        function RL(a, b) {
+            var t = "a";
+            var Yb = "+";
+            for (var c = 0; c < b.length - 2; c += 3) {
+                var d = b.charAt(c + 2),
+                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
+                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
+                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
+            }
+            return a
+        }
+        """)
+
+    def get_google_token(self, text):
+        """
+           get the google access token for a piece of text
+        :param text: str, input sentence
+        :return: str, token
+        """
+        return self.ctx.call("TL", text)
+
+
+def open_url(url):
+    """
+       add a browser header and request the url
+    :param url: str, url to request
+    :return: str, decoded response of the target url
+    """
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
+                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
+    req = requests.get(url=url, headers=headers)
+    return req.content.decode('utf-8')
+
+
+def max_length(content):
+    """
+       do not translate if the text exceeds the maximum length
+    :param content: str, text to translate
+    :return:
+    """
+    if len(content) > 4891:
+        logger.info("text to translate exceeds the length limit!")
+        return
+
+
+def translate_result(result):
+    """
+       strip the irrelevant parts of the raw response
+    :param result: str
+    :return: str
+    """
+    str_end = result.find("\",")
+    if str_end > 4:
+        return result[4:str_end]
+    else:
+        return None
+
+
+def any_to_any_translate(content, from_='zh-CN', to_='en'):
+    """
+       translate between any two of the supported languages
+    :param content: str, up to 4891 characters, user input
+    :param from_: str, original language
+    :param to_: str, target language
+    :return: str, result of translate
+    """
+    max_length(content)
+    tk = google_tokn.get_google_token(content)
+    content = parse.quote(content)
+    url = "http://translate.google.cn/translate_a/single?client=t&sl={0}&tl={1}" \
+          "&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&" \
+          "ie=UTF-8&oe=UTF-8&source=btn&ssel=3&tsel=3&kc=0&tk={2}&q={3}".format(from_, to_, tk, content)
+    result = open_url(url)
+    res = translate_result(result)
+    return res
+
+
+def any_to_any_translate_back(content, from_='zh-CN', to_='en'):
+    """
+       back-translation, e.g. Chinese-English then English-Chinese
+    :param content: str, up to 4891 characters, user input
+    :param from_: str, original language
+    :param to_: str, target language
+    :return: str, result of translate
+    """
+    translate_content = any_to_any_translate(content, from_=from_, to_=to_)
+    result = any_to_any_translate(translate_content, from_=to_, to_=from_)
+    return result
+
+
+if __name__ == '__main__':
+    google_tokn = GoogleToken()
+    while True:
+        sen_org = "过路蜻蜓喜欢口袋巧克力，这是什么意思"
+        for language_short_google_one in language_short_google:
+            text_translate = any_to_any_translate_back(sen_org, from_='zh', to_=language_short_google_one)
+            judge = judge_translate_english(sen_org, text_translate)
+            if judge:
+                print(language_short_google_one + " " + "True")
+                print(text_translate)
+            else:
+                print(language_short_google_one + " " + "False")
+                print(text_translate)
+# test results
+# en False
+# 我喜欢口袋巧克力，这是什么意思？
+# fr False
+# 我喜欢口袋巧克力，这是什么意思？
+# ru False
+# 我喜欢口袋糖果，这是什么意思？
+# de False
+# 我喜欢袋巧克力，这是什么意思？
\ No newline at end of file