Add files via upload
This commit is contained in:
parent
80fd973d94
commit
fb86363088
61
AugmentText/Readme.md
Normal file
@ -0,0 +1,61 @@
# AugmentText

# Overview

- Compared with image data augmentation, text data augmentation still has quite a few open problems.
- Strictly speaking, text data augmentation looks a lot like paraphrase (synonymous-sentence) generation, but it is not exactly that; it is a broader concept.
- Text data augmentation is usually needed for two reasons: not enough data, or imbalanced data.
- In my experience, the effective methods for text data augmentation are:
    - back-translation (translate twice, e.g. Chinese to English, then English back to Chinese),
    - and EDA (synonym replacement, random insertion, random swap and random deletion); at the time I had not thought of using insertion and swap.

### GitHub project address ###

https://github.com/yongzhuo/nlp_xiaojiang/tree/master/AugmentText

# Back-translation (relatively reliable)

- 1. Online translation services (Chinese -> English, French, German, Russian, Spanish, Portuguese, Japanese, Korean, Dutch, Arabic, etc.)
    - Google Translate (google): probably the best, supports the most languages, but for the moment I have no way to register an account from behind the firewall.
    - Baidu Translate (baidu): supports the most language pairs among domestic services (28 languages) and is the most generous: a registered account gets 2 million characters of free quota per month (roughly 2 MB); beyond that it is 49 CNY per million characters.
    - Youdao Translate (youdao): my favourite when I first got online, but expensive: only a 100 CNY trial credit, only 11 languages supported, 48 CNY per million characters.
    - Sogou Translate (sougou): a decent impression, it is a search-engine company after all; 78 languages, 200 CNY trial credit, 40 CNY per million characters for common languages and 60 CNY for uncommon ones.
    - Tencent Translate (tencent): Tencent AI always feels a step behind, and its APIs keep changing; this time its sign-based request signing was the annoying part (spaces have to be replaced with "+"). Perhaps AI is just not that important to the Penguin.
        - There are two services, Fanyijun and AI Lab, which support fewer languages; they seem to still be under development, with no quota limit but no concurrency guarantee, and only a PHP demo, no Python one.
    - Bing Translator (bing): Microsoft's product, not tried; using it directly on the web page works well enough.
    - You can call the official tools and APIs, simulate web-page access, or register accounts; a minimal back-translation sketch follows this section.
- 2. Offline translation tools
    - 1. Write your own: collect some parallel corpora and train seq2seq, NMT or Transformer models.
    - 2. NiuTrans: a rather old release, runs on Windows 10 or Linux, but only a pre-trained Chinese-English model is included.
        - Address: http://www.niutrans.com/index.html

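As a minimal illustration of the back-translation idea (translate out of Chinese and straight back again), the sketch below uses the same `translate` package that the translate-tools script in this repository relies on; the provider, language codes and rate limits are whatever the package defaults to, so treat it as a toy sketch rather than a production setup.

```python
# -*- coding: UTF-8 -*-
# A minimal back-translation sketch with the `translate` package (pip install translate).
from translate import Translator


def back_translate(text, pivot="en"):
    """Translate zh -> pivot -> zh and return the round-tripped sentence."""
    to_pivot = Translator(from_lang="zh", to_lang=pivot)  # zh -> pivot
    to_zh = Translator(from_lang=pivot, to_lang="zh")     # pivot -> zh
    return to_zh.translate(to_pivot.translate(text))


if __name__ == "__main__":
    print(back_translate("过路蜻蜓喜欢口袋巧克力,这是什么意思"))
```
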
# Synonym replacement (acceptable)

- 1. EDA (i.e. synonym replacement, random insertion, random swap and random deletion), from the paper "Easy data augmentation techniques for boosting performance on text classification tasks"
    - A Chinese demo is available in the GitHub project zhanlaoban/eda_nlp_for_Chinese: https://github.com/zhanlaoban/eda_nlp_for_Chinese
- 2. word2vec or dictionary-based synonym replacement
    - Instead of looking up synonyms with the synonyms toolkit as in 1, you can use gensim word vectors and take the words most similar to a given word as its synonyms (see the sketch after this list).
    - You can also do a purely mechanical dictionary lookup; a synonym dictionary is available in fighting41love/funNLP: https://github.com/fighting41love/funNLP/tree/master/data/

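A minimal sketch of the word2vec variant described above, assuming you already have word vectors in the standard word2vec text format (the path `w2v_chinese.txt` is a placeholder, not a file shipped with this repository):

```python
# -*- coding: UTF-8 -*-
# Synonym lookup with gensim word vectors: take the top-k most similar words as "synonyms".
from gensim.models import KeyedVectors

# Placeholder path; point this at your own pre-trained Chinese word vectors.
w2v = KeyedVectors.load_word2vec_format("w2v_chinese.txt", binary=False)


def similar_words(word, topn=5):
    """Return up to topn nearest neighbours of `word`, or [] if it is out of vocabulary."""
    if word not in w2v:
        return []
    return [w for w, _score in w2v.most_similar(word, topn=topn)]


if __name__ == "__main__":
    print(similar_words("喜欢"))
```
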
# Syntax, sentence expansion, sentence compression (rather difficult)

- 1. Sentence compression: find the subject, predicate, object, etc. of a sentence
    - There is a Java project that calls the Stanford parser (not my favourite) to extract the subject-predicate-object structure.
    - Address (subject-predicate-object extractor): https://github.com/hankcs/MainPartExtractor
- 2. Sentence expansion: todo
- 3. Syntax: todo

# HMM-Markov (rather low quality)

- How the HMM/Markov sentence generation works: build a state transition matrix from the corpus, use a keyword (e.g. extracted with jieba) as the starting word, then generate a sentence by walking the chain; a toy sketch follows below.
- Reference project: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov

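A toy sketch of that idea (a simplified, self-contained version of what `augment_marko/enhance_marko.py` does; the tiny corpus is made up for illustration): build a word-to-next-word table with jieba, then walk it from a chosen start word until a sentence-final word is reached.

```python
# -*- coding: UTF-8 -*-
# Tiny first-order Markov sentence generator (a simplified sketch of enhance_marko.py).
import random
import jieba


def build_chain(sentences):
    """Map each word to the words that follow it in the corpus; 'FINISH' collects final words."""
    chain = {"FINISH": []}
    for sentence in sentences:
        words = list(jieba.cut(sentence.strip()))
        for i, word in enumerate(words):
            if i == len(words) - 1:
                chain["FINISH"].append(word)
            else:
                chain.setdefault(word, []).append(words[i + 1])
    return chain


def generate(chain, start_word, max_len=20):
    """Walk the chain from start_word until a sentence-final word or max_len is reached."""
    out = [start_word]
    while len(out) < max_len and out[-1] in chain and out[-1] not in chain["FINISH"]:
        out.append(random.choice(chain[out[-1]]))
    return "".join(out)


if __name__ == "__main__":
    corpus = ["我喜欢口袋巧克力", "我喜欢看书", "他喜欢大漠帝国"]
    print(generate(build_chain(corpus), "我"))
```
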
# Deep learning methods: todo

- seq2seq
- bert
- transformer
- GAN

# References / Thanks

- eda_chinese: https://github.com/zhanlaoban/eda_nlp_for_Chinese
- Subject-predicate-object extractor: https://github.com/hankcs/MainPartExtractor
- HMM sentence generation: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov
- Synonyms and more: https://github.com/fighting41love/funNLP/tree/master/data/
- NiuTrans: http://www.niutrans.com/index.html
5
AugmentText/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 19:44
# @author :Mo
# @function :
5
AugmentText/augment_eda/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:14
# @author :Mo
# @function :
235
AugmentText/augment_eda/enhance_eda.py
Normal file
@ -0,0 +1,235 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/1 10:35
# @author :Mo
# @function :enhance text by eda; eda is replace, insert, swap, delete


from utils.text_tools import is_total_english
from utils.text_tools import is_total_number
from conf.path_config import stop_words_path
from utils.text_tools import jieba_cut
from random import shuffle
import synonyms
import random


random.seed(2019)
key_word_list = ["rsh", "mo", "大漠帝国"]


# stop-word list, by default the hanlp stop-word table
stop_words = []
with open(stop_words_path, "r", encoding="utf-8") as f_stop:
    for stop_word in f_stop.readlines():
        stop_words.append(stop_word.strip())


def synonym_replacement(words, n, key_words):
    """
        synonym replacement: replace n words of the sentence with their synonyms
    :param words: list, input sentence
    :param n: int, number of words to replace
    :param key_words: list, words that must not be replaced
    :return: list, new_words
    """
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        sim_synonyms = get_syn_by_synonyms(random_word)
        if len(sim_synonyms) >= 1 and random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
            synonym = random.choice(sim_synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')
    return new_words


def get_syn_by_synonyms(word):
    if not is_total_english(word.strip()):
        return synonyms.nearby(word)[0]
    else:
        return word


def random_insertion(words, n, key_words):
    """
        random insertion: insert n words into the sentence at random positions
    :param words: list, input sentence
    :param n: int, number of words to insert
    :param key_words: list, words that must not be used as insertion sources
    :return: list, new_words
    """
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words, key_words)
    return new_words


def add_word(new_words, key_words):
    """
        insert one synonym into the list at a random position
    :param new_words: list, input sentence, modified in place
    :param key_words: list, words that must not be used as insertion sources
    :return: None
    """
    syns = []  # renamed from "synonyms" to avoid shadowing the imported synonyms module
    counter = 0
    while len(syns) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        # filter out key words, pure-English and pure-number tokens
        if random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
            syns = get_syn_by_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = random.choice(syns)
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)


def random_swap(words, n):
    """
        random swap: swap two random words, n times
    :param words: list, input sentence
    :param n: int, number of swaps
    :return: list, new_words
    """
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words


def swap_word(new_words):
    """
        randomly swap two words once
    :param new_words: list, input sentence
    :return: list, new_words
    """
    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words


def random_deletion(words, p, key_words):
    """
        random deletion: delete each word of the sentence with probability p
    :param words: list, input sentence
    :param p: float, deletion probability
    :param key_words: list, words that must never be deleted
    :return: list, new_words
    """
    if len(words) == 1:
        return words

    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p or word in key_words:
            new_words.append(word)

    if len(new_words) == 0:
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]

    return new_words


def sentence_replace_whitespace(sentences):
    """
        remove whitespace from the generated sentences
    :param sentences: list, sentences to clean
    :return: list, cleaned sentences, one per line
    """
    sentences_new = []
    for sentence in sentences:
        sentence_replace = sentence.replace(" ", "").strip()
        sentences_new.append(sentence_replace + "\n")
    return sentences_new


def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9, key_words=[]):
    """
        EDA: synonym replacement, random insertion, random swap and random deletion
    :param sentence: str, input sentence
    :param alpha_sr: float, ratio of words changed by synonym replacement
    :param alpha_ri: float, ratio of words added by random insertion
    :param alpha_rs: float, ratio of words moved by random swap
    :param p_rd: float, probability used by random deletion
    :param num_aug: int, number of new sentences to generate
    :param key_words: list, words that must be kept unchanged
    :return: list, augmented sentences
    """
    seg_list = jieba_cut(sentence)
    seg_list = " ".join(seg_list)
    words = list(seg_list.split())
    num_words = len(words)

    augmented_sentences = []
    num_new_per_technique = int(num_aug * 2 / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words)) * 2
    n_ri = max(1, int(alpha_ri * num_words)) * 2
    n_rs = max(1, int(alpha_rs * num_words))

    # synonym replacement (sr)
    for _ in range(num_new_per_technique):
        a_words = synonym_replacement(words, n_sr, key_words)
        augmented_sentences.append(''.join(a_words))

    # random insertion (ri)
    for _ in range(num_new_per_technique):
        a_words = random_insertion(words, n_ri, key_words)
        augmented_sentences.append(''.join(a_words))

    # random swap (rs)
    for _ in range(num_new_per_technique):
        a_words = random_swap(words, n_rs)
        augmented_sentences.append(''.join(a_words))

    # random deletion (rd)
    for _ in range(num_new_per_technique):
        a_words = random_deletion(words, p_rd, key_words)
        augmented_sentences.append(''.join(a_words))

    augmented_sentences = list(set(augmented_sentences))
    shuffle(augmented_sentences)
    # drop sentences that are too short
    augmented_sentences_new = []
    for augmented_sentences_one in augmented_sentences:
        if len(augmented_sentences_one) > 5:
            augmented_sentences_new.append(augmented_sentences_one)

    augmented_sentences = augmented_sentences_new
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    if len(augmented_sentences) > num_aug:
        augmented_sentences = augmented_sentences[0:num_aug]
    # augmented_sentences.append(seg_list)
    return augmented_sentences


if __name__ == "__main__":
    des = get_syn_by_synonyms("同义词")
    print(des)
    syn = eda(sentence="rsh喜欢大漠帝国吗", alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=10, key_words=key_word_list)
    syn_s = sentence_replace_whitespace(syn)
    print(syn)
    while True:
        print('输入: ')
        sen = input()
        syn = eda(sentence=sen)
        print(syn)
5
AugmentText/augment_marko/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:14
# @author :Mo
# @function :
196
AugmentText/augment_marko/enhance_marko.py
Normal file
@ -0,0 +1,196 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @Time :2019/3/25 14:11
# @author :Mo
# @function :generate disordered sentences by a Markov chain

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from conf.path_config import chicken_and_gossip_path
from conf.path_config import projectdir
from utils.text_tools import txtRead
from utils.text_tools import txtWrite
from jieba import analyse
import random
import jieba


# TF-IDF keyword extraction interface
tfidf = analyse.extract_tags
# TextRank keyword extraction interface
textrank = analyse.textrank


def create_model(model_markov, datalist):
    """
        build the word-transition model of the corpus
    :param model_markov: dict, transition table to fill
    :param datalist: list, corpus sentences
    :return: dict, the filled transition table
    """
    for line in datalist:
        line = list(jieba.cut(line.lower().strip(), cut_all=False))
        for i, word in enumerate(line):
            if i == len(line) - 1:
                model_markov['FINISH'] = model_markov.get('FINISH', []) + [word]
            else:
                if i == 0:
                    model_markov['BEGIN'] = model_markov.get('BEGIN', []) + [word]
                model_markov[word] = model_markov.get(word, []) + [line[i + 1]]

    for key in model_markov.keys():
        model_markov[key] = list(set(model_markov[key]))

    return model_markov


def generate_random_1(model_markov, gen_words):
    """
        generate a synonymous sentence with the Markov chain; in essence, walk from one word to the next
    :param model_markov: dict, Markov transition table
    :param gen_words: list, starting words (may be empty)
    :return: str, generated sentence
    """
    while True:
        if not gen_words:
            words = model_markov['BEGIN']
        elif gen_words[-1] in model_markov['FINISH']:
            break
        else:
            try:
                words = model_markov[gen_words[-1]]
            except Exception as e:
                return "".join(gen_words) + "\n"
        # pick the next word at random
        gen_words.append(random.choice(words))

    return "".join(gen_words) + "\n"


def generate_random_select(generated, model_marko, twice=100000, len_min=5):
    """
        generate sentences by walking the chain `twice` times (the caller below uses 1000)
    :param generated: list, one key word, e.g. ["建行"]
    :param model_marko: dict, transition matrix
    :param twice: int, number of generation attempts
    :param len_min: int, minimum length of a generated sentence
    :return: list, syn_generates
    """
    syn_generates = set()
    for num in range(twice):
        syn_generate = generate_random_1(model_marko, generated)
        generated = []
        if len(syn_generate) > len_min:
            syn_generates.add(syn_generate)
    return list(syn_generates)


def get_keyword_from_tf(sentences, p):
    """
        get the hot words of a corpus by term frequency
    :param sentences: list, raw sentences
    :param p: float, rate, 0 < p < 1
    :return: list, words
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    # token_pattern sets the token regex; the default behaves like English and drops single characters
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    # norm=None: do not normalise the term-frequency counts
    # use_idf=False: we go through the tf-idf transformer but only want the tf part
    transformer = TfidfTransformer(norm=None, use_idf=False)
    vectorizer.fit_transform(sentence_cut_list)
    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
    word = vectorizer.get_feature_names()
    # weight = tf.toarray()
    return word[-int(len(word) * p):]


def get_begin_word(sentences, p):
    """
        get the first word of each sentence after jieba segmentation
    :param sentences: list, sentences of input
    :param p: float, rate, 0 < p < 1
    :return: list, key_words
    """
    sentence_cut_begin_list = [list(jieba.cut(text.strip(), cut_all=False, HMM=True))[0] for text in sentences]
    len_begin_p = int(len(sentence_cut_begin_list) * p)
    return sentence_cut_begin_list[-len_begin_p:]


def get_keyword_from_jieba_tfidf(sentences, p):
    """
        keyword extraction based on the TF-IDF algorithm
    :param sentences: list, sentences of input
    :param p: float, rate, unused here
    :return: list, keywords
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    sentence_cut_list_str = str(sentence_cut_list)
    key_word = tfidf(sentence_cut_list_str)
    return key_word


def get_keyword_from_jieba_textrank(sentences, p):
    """
        keyword extraction based on the TextRank algorithm
    :param sentences: list, sentences of input
    :param p: float, rate, 0 < p < 1
    :return: list, keywords
    """
    key_words = []
    for sentences_one in sentences:
        key_word = textrank(sentences_one)
        key_words = key_words + key_word
    # token_pattern sets the token regex; the default behaves like English and drops single characters
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    vectorizer.fit_transform(key_words)
    word = vectorizer.get_feature_names()
    return word[-int(len(word) * p):]


def generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1):
    """
        generate sentences from the original corpus and keep only those that do not already appear in it
    :param sentence_list: list, corpus sentences
    :param begin_word: str, "tf", "tfidf", "textrank" or "begin_word"
    :param p: float, rate, 0 < p < 1
    :return: list, generated sentences
    """
    # get the hot keywords
    if begin_word == "tf":
        generated_hot = get_keyword_from_tf(sentence_list, p)
    elif begin_word == "textrank":
        generated_hot = get_keyword_from_jieba_textrank(sentence_list, p)
    elif begin_word == "begin_word":
        generated_hot = get_begin_word(sentence_list, p)
    else:
        generated_hot = get_keyword_from_jieba_tfidf(sentence_list, p)

    # build the transition model
    model_txt = {}
    model_txt = create_model(model_txt, sentence_list)
    # generate synonymous sentences starting from the keywords
    gen_all_syn = []
    for generated_hot_one in generated_hot:
        generated_hot_one_1 = [generated_hot_one]
        generated_str = generate_random_select(generated_hot_one_1, model_txt, twice=1000, len_min=5)
        if generated_str:
            gen_all_syn = gen_all_syn + generated_str
    # keep only the sentences that are not in the original corpus
    gen_all_syn = list(set(gen_all_syn))
    # intersection of the generated sentences with the original ones
    syn_intersection = list(set(sentence_list).intersection(set(gen_all_syn)))
    # generated sentences minus that intersection
    gen_syns = list(set(gen_all_syn).difference(set(syn_intersection)))
    return gen_syns


if __name__ == "__main__":
    # read a file, then generate sentences
    txt_path = chicken_and_gossip_path
    sentence_list = txtRead(txt_path)
    sentence_list = sentence_list[0:100]
    enhance_texts = generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1)
    for enhance_texts_one in enhance_texts:
        try:
            print(enhance_texts_one)
        except Exception as e:
            print(str(e))
5
AugmentText/augment_syntax/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:16
# @author :Mo
# @function :
5
AugmentText/augment_translate/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:15
# @author :Mo
# @function :
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:58
# @author :Mo
# @function :
@ -0,0 +1,107 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 23:05
# @author :Mo
# @function :back-translation with a Tencent account (Fanyijun)


from conf.augment_constant import language_short_tencent
from conf.augment_constant import app_secret_tentcnet
from conf.augment_constant import app_key_tencent
from urllib.parse import quote
import logging as logger
import requests
import hashlib
import random
import string
import time
import json


def md5_sign(text):
    """
        compute the MD5 signature
    :param text: str, string to sign
    :return: str, upper-case hex digest
    """
    md5_model = hashlib.md5(text.encode("utf8"))
    return md5_model.hexdigest().upper()


def get_params(text, from_l="zh", to_l="en"):
    """
        build the request params and their sign
    :param text: str, input sentence
    :param from_l: source language
    :param to_l: target language
    :return: dict, params
    """
    # request timestamp in seconds, guards against replay (the signature stays valid for 5 minutes)
    time_stamp = str(int(time.time()))
    # random request string, makes the signature unpredictable
    nonce_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
    params = {'app_id': app_key_tencent,
              'source': from_l,
              'target': to_l,
              'text': text,
              'time_stamp': time_stamp,
              'nonce_str': nonce_str
              }
    signs = ''
    # sort the keys before concatenating
    for key in sorted(params):
        # the value part must be URL-encoded with upper-case hex (e.g. %E8); quote uses upper case by default
        signs += '{}={}&'.format(key, quote(params[key], safe='').replace("%20", "+"))
    # append the application secret, with key name app_key, to the end of the string
    signs += 'app_key={}'.format(app_secret_tentcnet)
    # the MD5 of this string is the request signature
    sign = md5_sign(signs)
    params['sign'] = sign
    return params


def any_to_any_translate_tencent(text, from_='zh', to_='en'):
    """
        call Tencent translation from any supported language to another; see the constant language_short_tencent for details
    :param text: str, input sentence
    :param from_: source language
    :param to_: target language
    :return: str, translated sentence
    """
    try:
        url = "https://api.ai.qq.com/fcgi-bin/nlp/nlp_texttranslate"
        params_text = get_params(text, from_l=from_, to_l=to_)
        res_post = requests.request("POST", url, data=params_text)
        res_content = res_post.content.decode("utf8")
        res_json = json.loads(res_content)
        target_text = res_json["data"]["target_text"]
        return target_text
    except Exception as e:
        logger.error(str(e))
        return None


def translate_tencent_back(text, from_='zh', to_='en'):
    """
        back-translation: call the Tencent translation twice
    :param text: str, input sentence
    :param from_: source language
    :param to_: target language
    :return: str, back-translated sentence
    """
    try:
        text_from_to = any_to_any_translate_tencent(text, from_=from_, to_=to_)
        text_to_from = any_to_any_translate_tencent(text_from_to, from_=to_, to_=from_)
        return text_to_from
    except Exception as e:
        logger.error(str(e))
        return None


if __name__ == '__main__':
    text_test = "你觉得JY会喜欢暗影随风、大漠帝国吗".strip()
    for to_test in language_short_tencent:
        res_test = translate_tencent_back(text_test, from_='zh', to_=to_test)
        print("没有账户就为空,回译结果: ")
        print(res_test)
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:57
# @author :Mo
# @function :
@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 23:05
# @author :Mo
# @function :back-translation with the translate.Translator tool


from conf.augment_constant import language_short_google
from utils.text_tools import judge_translate_english
from translate import Translator


def translate_tools_translate(text, to_='en'):
    """
        generate a sentence by back-translation with the translate package
    :param text: str, input
    :param to_: str, pivot language code
    :return: str, result
    """
    # provider = 'mymemory', 'microsoft'
    translator1 = Translator(to_lang=to_, from_lang='zh', provider=None, secret_access_key=None)
    translator2 = Translator(to_lang="zh", from_lang=to_, provider=None, secret_access_key=None)

    translation1 = translator1.translate(text)
    translation2 = translator2.translate(translation1)
    return translation2


if __name__ == "__main__":
    sen_org = "大漠帝国喜欢RSH、JY吗"
    for language_short_google_one in language_short_google:
        text_translate = translate_tools_translate(sen_org, to_=language_short_google_one)
        judge = judge_translate_english(sen_org, text_translate)
        if judge:
            print("True")
            print(text_translate)
        else:
            print("False")
            print(text_translate)
    # test results:
    # False
    # 沙漠帝国是否像RSH,JY?
    # False
    # 沙漠帝国看起来像RSH,JY?
    # False
    # 帝国沙漠像rsh,jy?
5
AugmentText/augment_translate/translate_web/__init__.py
Normal file
@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:58
# @author :Mo
# @function :
154
AugmentText/augment_translate/translate_web/translate_google.py
Normal file
@ -0,0 +1,154 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @Time :2019/3/21 14:30
# @author :Mo
# @function :back-translation via Google Translate, simulating the google token access

from conf.augment_constant import language_short_google
from utils.text_tools import judge_translate_english
import logging as logger
import urllib.parse as parse
import requests
import execjs


class GoogleToken:
    def __init__(self):
        self.ctx = execjs.compile("""
        function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;
            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";
            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240,
                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128),
                e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f],
            a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };
        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2),
                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)

    def get_google_token(self, text):
        """
            get the google access token for a piece of text
        :param text: str, input sentence
        :return: str, token
        """
        return self.ctx.call("TL", text)


def open_url(url):
    """
        add request headers and fetch the url
    :param url: str, url to request
    :return: str, response body of the target url
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    req = requests.get(url=url, headers=headers)
    return req.content.decode('utf-8')


def max_length(content):
    """
        log a warning when the content exceeds the maximum translatable length
    :param content: str, text to translate
    :return: None
    """
    if len(content) > 4891:
        logger.info("翻译文本超过限制!")
        return


def translate_result(result):
    """
        strip the irrelevant parts of the raw response
    :param result: str
    :return: str
    """
    str_end = result.find("\",")
    if str_end > 4:
        return result[4:str_end]
    else:
        return None


def any_to_any_translate(content, from_='zh-CN', to_='en'):
    """
        translate between two freely chosen languages
    :param content: str, user input, at most 4891 characters
    :param from_: str, original language
    :param to_: str, target language
    :return: str, result of translation
    """
    max_length(content)
    tk = google_tokn.get_google_token(content)
    content = parse.quote(content)
    url = "http://translate.google.cn/translate_a/single?client=t&sl={0}&tl={1}" \
          "&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&" \
          "ie=UTF-8&oe=UTF-8&source=btn&ssel=3&tsel=3&kc=0&tk={2}&q={3}".format(from_, to_, tk, content)
    result = open_url(url)
    res = translate_result(result)
    return res


def any_to_any_translate_back(content, from_='zh-CN', to_='en'):
    """
        back-translation, e.g. Chinese-English then English-Chinese
    :param content: str, user input, at most 4891 characters
    :param from_: str, original language
    :param to_: str, target language
    :return: str, result of translation
    """
    translate_content = any_to_any_translate(content, from_=from_, to_=to_)
    result = any_to_any_translate(translate_content, from_=to_, to_=from_)
    return result


if __name__ == '__main__':
    google_tokn = GoogleToken()
    while True:
        sen_org = "过路蜻蜓喜欢口袋巧克力,这是什么意思"
        for language_short_google_one in language_short_google:
            text_translate = any_to_any_translate_back(sen_org, from_='zh', to_=language_short_google_one)
            judge = judge_translate_english(sen_org, text_translate)
            if judge:
                print(language_short_google_one + " " + "True")
                print(text_translate)
            else:
                print(language_short_google_one + " " + "False")
                print(text_translate)

# test results
# en False
# 我喜欢口袋巧克力,这是什么意思?
# fr False
# 我喜欢口袋巧克力,这是什么意思?
# ru False
# 我喜欢口袋糖果,这是什么意思?
# de False
# 我喜欢袋巧克力,这是什么意思?