Add files via upload

yongzhuo 2019-04-09 23:26:23 +08:00 committed by GitHub
parent 80fd973d94
commit fb86363088
14 changed files with 839 additions and 0 deletions

AugmentText/Readme.md Normal file

@ -0,0 +1,61 @@
# AugmentText
# Overview
- Compared with image data augmentation, text data augmentation still has many open problems.
- Strictly speaking, text data augmentation is closer to synonymous-sentence generation, but not exactly: it is a broader concept.
- Text data augmentation is usually needed for two reasons: not enough data, and imbalanced data.
- In my experience, the effective methods for text data augmentation are:
    - back-translation (translate twice, e.g. Chinese to English, then English back to Chinese), and
    - EDA (synonym replacement, insertion, swap, and deletion); at the time I really had not thought of using insertion and swap myself.
### GitHub project address
https://github.com/yongzhuo/nlp_xiaojiang/tree/master/AugmentText
# Back-translation (relatively reliable)
- 1. Online translation tools (Chinese -> English, French, German, Russian, Spanish, Portuguese, Japanese, Korean, Dutch, Arabic, etc.); a minimal back-translation sketch follows this section.
    - Google Translate (google): probably the best, and it supports the most languages, but for now I cannot get past the firewall to register an account.
    - Baidu Translate (baidu): supports the most language pairs among the domestic services (28 languages, any-to-any) and is the most generous: after registering an account you get 2 million free characters per month (about 2 MB); beyond that it is 49 CNY per million characters.
    - Youdao Translate (youdao): my favorite when I first got online, but pricey; only a 100 CNY trial credit, poor value. It supports just 11 languages, at 48 CNY per million characters.
    - Sogou Translate (sougou): a decent impression, it is a search-engine company after all. 78 languages, a 200 CNY trial credit, 40 CNY per million characters for common languages and 60 CNY for uncommon ones.
    - Tencent Translate (tencent): Tencent AI always feels late to the game, and the company keeps changing its APIs; this time its sign encryption was the pain point (spaces must be replaced with "+"). Perhaps AI is just not that important to the penguin.
        - There are two services, Fanyijun (the translation assistant) and something from AI Lab; they support fewer languages and still seem to be under development. No quota limit, but no concurrency guarantee either, and only a PHP demo, no Python one.
    - Bing Translate (bing): Microsoft's offering, you know; I have not tried the API, but the web page works well enough.
    - You can call the APIs with tools, simulate web-page access, or register accounts.
- 2. Offline translation tools
    - 1. Roll your own: collect a corpus and train seq2seq, NMT, or Transformer models.
    - 2. NiuTrans: a fairly old version runs on Win10 or Linux, but only a pre-trained Chinese-English model is included.
      Address: http://www.niutrans.com/index.html
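
A minimal back-translation sketch, assuming the third-party `translate` package (the same tool used by the translate-tools script later in this commit); providers and quotas vary, so treat it as illustrative only:

```python
# Back-translation: zh -> pivot language -> zh; the round trip paraphrases the input.
from translate import Translator


def back_translate(text, mid_lang="en"):
    to_mid = Translator(from_lang="zh", to_lang=mid_lang).translate(text)
    return Translator(from_lang=mid_lang, to_lang="zh").translate(to_mid)


if __name__ == "__main__":
    print(back_translate("文本数据增强现在还有很多问题"))
```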
# Synonym replacement (acceptable)
- 1. EDA (that is, synonym replacement, insertion, swap, and deletion), from the paper "Easy data augmentation techniques for boosting performance on text classification tasks"
    - A Chinese implementation demo: github project zhanlaoban/eda_nlp_for_Chinese, address: https://github.com/zhanlaoban/eda_nlp_for_Chinese
- 2. word2vec or dictionary-based synonym replacement (see the sketch after this list)
    - Instead of the synonyms toolkit used in 1, you can use gensim word vectors and take the words most similar to a given word as its synonyms.
    - You can also do a mechanical dictionary lookup in a synonym dictionary; fighting41love/funNLP works, address: https://github.com/fighting41love/funNLP/tree/master/data/
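
A small sketch of the gensim variant, assuming a pre-trained word2vec file at the hypothetical path `w2v.bin`:

```python
# Synonym replacement via word vectors: swap a word for one of its nearest neighbours.
import random

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("w2v.bin", binary=True)  # hypothetical path


def replace_with_similar(words, n=1):
    new_words = words.copy()
    candidates = [w for w in set(words) if w in wv]
    random.shuffle(candidates)
    for word in candidates[:n]:
        synonym = random.choice(wv.most_similar(word, topn=3))[0]
        new_words = [synonym if w == word else w for w in new_words]
    return new_words


if __name__ == "__main__":
    print(replace_with_similar(["我", "喜欢", "吃", "苹果"], n=1))
```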
# Syntax, sentence expansion, sentence abbreviation (rather difficult)
- 1. Sentence abbreviation: find the subject, predicate, object, etc. of a sentence
    - There is a Java project that calls the Stanford tools (which I do not love) to extract the subject-predicate-object.
    - Address: MainPartExtractor, https://github.com/hankcs/MainPartExtractor
- 2. Sentence expansion: todo
- 3. Syntax: todo
# HMM / Markov (rather poor quality)
- How HMM sentence generation works: build a state-transition matrix from the corpus, extract keywords (with jieba etc.) as start words, and generate sentences from them; a tiny sketch follows this section.
- Reference project: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov
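
A tiny sketch of that principle (the full implementation is in the Markov script later in this commit): count word-to-word transitions, then walk the chain from a start word.

```python
# Markov-chain sentence generation: word -> possible next words, then a random walk.
import random

import jieba


def build_transitions(sentences):
    trans = {}
    for sen in sentences:
        tokens = list(jieba.cut(sen.strip()))
        for a, b in zip(tokens, tokens[1:]):
            trans.setdefault(a, []).append(b)
    return trans


def walk(trans, start, max_len=20):
    out = [start]
    while out[-1] in trans and len(out) < max_len:
        out.append(random.choice(trans[out[-1]]))
    return "".join(out)


if __name__ == "__main__":
    trans = build_transitions(["我喜欢吃苹果", "我喜欢看书", "他喜欢吃葡萄"])
    print(walk(trans, "我"))
```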
# Deep learning methods (todo)
- seq2seq
- bert
- transformer
- GAN
# References / Thanks
- eda_chinese: https://github.com/zhanlaoban/eda_nlp_for_Chinese
- MainPartExtractor (subject-predicate-object extractor): https://github.com/hankcs/MainPartExtractor
- HMM sentence generation: https://github.com/takeToDreamLand/SentenceGenerate_byMarkov
- Synonyms etc.: https://github.com/fighting41love/funNLP/tree/master/data/
- NiuTrans: http://www.niutrans.com/index.html

AugmentText/__init__.py Normal file

@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 19:44
# @author :Mo
# @function :


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:14
# @author :Mo
# @function :


@ -0,0 +1,235 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/1 10:35
# @author :Mo
# @function :enhance text by EDA (synonym replace, insert, swap, delete)
from utils.text_tools import is_total_english
from utils.text_tools import is_total_number
from conf.path_config import stop_words_path
from utils.text_tools import jieba_cut
from random import shuffle
import synonyms
import random
random.seed(2019)
key_word_list = ["rsh", "mo", "大漠帝国"]
# stop-word list; the HanLP stop-word list is used by default
stop_words = []
with open(stop_words_path, "r", encoding="utf-8") as f_stop:
    for stop_word in f_stop.readlines():
        stop_words.append(stop_word.strip())
def synonym_replacement(words, n, key_words):
    """
    Synonym replacement: replace n words of a sentence with their synonyms.
    :param words: list, input sentence (tokenized)
    :param n: int, number of words to replace
    :param key_words: list, words that must not be replaced
    :return: list, new_words
    """
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        sim_synonyms = get_syn_by_synonyms(random_word)
        if len(sim_synonyms) >= 1 and random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
            synonym = random.choice(sim_synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')
    return new_words


def get_syn_by_synonyms(word):
    if not is_total_english(word.strip()):
        return synonyms.nearby(word)[0]
    else:
        return word
def random_insertion(words, n, key_words):
    """
    Random insertion: insert n synonyms at random positions in the sentence.
    :param words: list, input sentence (tokenized)
    :param n: int, number of words to insert
    :param key_words: list, words that must not be used as insertion sources
    :return: list, new_words
    """
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words, key_words)
    return new_words


def add_word(new_words, key_words):
    """
    Insert one synonym of a random word at a random position in the list.
    :param new_words: list, input sentence (modified in place)
    :param key_words: list, words that must not be used as insertion sources
    """
    syns = []  # renamed from `synonyms` to avoid shadowing the synonyms module
    counter = 0
    while len(syns) < 1:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        # filter out key words, pure-English and pure-number tokens
        if random_word not in key_words and not is_total_english(random_word) and not is_total_number(random_word):
            syns = get_syn_by_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = random.choice(syns)
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)
def random_swap(words, n):
    """
    Random swap: swap two random words, n times.
    :param words: list, input sentence (tokenized)
    :param n: int, number of swaps
    :return: list, new_words
    """
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words


def swap_word(new_words):
    """
    Swap two randomly chosen words once.
    :param new_words: list, input sentence
    :return: list, new_words
    """
    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words
def random_deletion(words, p, key_words):
    """
    Random deletion: delete each word of the sentence with probability p.
    :param words: list, input sentence (tokenized)
    :param p: float, deletion probability
    :param key_words: list, words that must never be deleted
    :return: list, remaining words
    """
    if len(words) == 1:
        return words
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p or word in key_words:
            new_words.append(word)
    # if everything was deleted, keep one random word
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]
    return new_words
def sentence_replace_whitespace(sentences):
    """
    Remove whitespace from each sentence.
    :param sentences: list of str
    :return: list of str
    """
    sentences_new = []
    for sentence in sentences:
        sentence_replace = sentence.replace(" ", "").strip()
        sentences_new.append(sentence_replace + "\n")
    return sentences_new
def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9, key_words=None):
    """
    EDA: synonym replacement, random insertion, random swap and random deletion.
    :param sentence: str, input sentence
    :param alpha_sr: float, ratio for synonym_replacement
    :param alpha_ri: float, ratio for random_insertion
    :param alpha_rs: float, ratio for random_swap
    :param p_rd: float, probability for random_deletion
    :param num_aug: int, number of new sentences to generate
    :param key_words: list, words that must be kept unchanged
    :return: list, augmented sentences
    """
    key_words = key_words if key_words else []  # avoid a mutable default argument
    seg_list = jieba_cut(sentence)
    seg_list = " ".join(seg_list)
    words = list(seg_list.split())
    num_words = len(words)
    augmented_sentences = []
    # four techniques, each generating roughly num_aug/2 + 1 candidates,
    # so there are about 2 * num_aug candidates before filtering
    num_new_per_technique = int(num_aug * 2 / 4) + 1
    n_sr = max(1, int(alpha_sr * num_words)) * 2
    n_ri = max(1, int(alpha_ri * num_words)) * 2
    n_rs = max(1, int(alpha_rs * num_words))
    # synonym replacement (sr)
    for _ in range(num_new_per_technique):
        a_words = synonym_replacement(words, n_sr, key_words)
        augmented_sentences.append(''.join(a_words))
    # random insertion (ri)
    for _ in range(num_new_per_technique):
        a_words = random_insertion(words, n_ri, key_words)
        augmented_sentences.append(''.join(a_words))
    # random swap (rs)
    for _ in range(num_new_per_technique):
        a_words = random_swap(words, n_rs)
        augmented_sentences.append(''.join(a_words))
    # random deletion (rd)
    for _ in range(num_new_per_technique):
        a_words = random_deletion(words, p_rd, key_words)
        augmented_sentences.append(''.join(a_words))
    augmented_sentences = list(set(augmented_sentences))
    shuffle(augmented_sentences)
    # drop sentences that are too short
    augmented_sentences_new = []
    for augmented_sentences_one in augmented_sentences:
        if len(augmented_sentences_one) > 5:
            augmented_sentences_new.append(augmented_sentences_one)
    augmented_sentences = augmented_sentences_new
    # keep at most num_aug sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]
        if len(augmented_sentences) > num_aug:
            augmented_sentences = augmented_sentences[0:num_aug]
    # augmented_sentences.append(seg_list)
    return augmented_sentences
if __name__ == "__main__":
    des = get_syn_by_synonyms("同义词")
    print(des)
    syn = eda(sentence="rsh喜欢大漠帝国吗", alpha_sr=0.2, alpha_ri=0.2, alpha_rs=0.2, p_rd=0.2, num_aug=10, key_words=key_word_list)
    syn_s = sentence_replace_whitespace(syn)
    print(syn)
    while True:
        print('Input: ')
        sen = input()
        syn = eda(sentence=sen)
        print(syn)


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:14
# @author :Mo
# @function :


@ -0,0 +1,196 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @Time :2019/3/25 14:11
# @author :Mo
# @function :generate sentences with a Markov chain
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from conf.path_config import chicken_and_gossip_path
from conf.path_config import projectdir
from utils.text_tools import txtRead
from utils.text_tools import txtWrite
from jieba import analyse
import random
import jieba
# TF-IDF keyword-extraction interface of jieba
tfidf = analyse.extract_tags
# TextRank keyword-extraction interface of jieba
textrank = analyse.textrank
def create_model(model_markov, datalist):
    """
    Build the word-transition model from sentences.
    :param model_markov: dict, transition table to fill
    :param datalist: list of str, corpus sentences
    :return: dict, word -> list of possible next words
    """
    for line in datalist:
        line = list(jieba.cut(line.lower().strip(), cut_all=False))
        for i, word in enumerate(line):
            if i == len(line) - 1:
                model_markov['FINISH'] = model_markov.get('FINISH', []) + [word]
            else:
                if i == 0:
                    model_markov['BEGIN'] = model_markov.get('BEGIN', []) + [word]
                model_markov[word] = model_markov.get(word, []) + [line[i + 1]]
    for key in model_markov.keys():
        model_markov[key] = list(set(model_markov[key]))
    return model_markov
def generate_random_1(model_markov, gen_words):
    """
    Generate a synonymous sentence by walking the Markov chain,
    i.e. repeatedly stepping from one word to a possible next word.
    :param model_markov: dict, transition table built by create_model
    :param gen_words: list, words generated so far (may hold a start word)
    :return: str
    """
    while True:
        if not gen_words:
            words = model_markov['BEGIN']
        elif gen_words[-1] in model_markov['FINISH']:
            break
        else:
            try:
                words = model_markov[gen_words[-1]]
            except Exception as e:
                return "".join(gen_words) + "\n"
        # randomly pick the next word
        gen_words.append(random.choice(words))
    return "".join(gen_words) + "\n"
def generate_random_select(generated, model_marko, twice=100000, len_min=5):
    """
    Generate sentences by walking the chain `twice` times.
    :param generated: list with one key word, e.g. ["建行"]
    :param model_marko: dict, transition matrix
    :param twice: int, number of generation attempts
    :param len_min: int, minimum length of a generated sentence
    :return: list, syn_generates
    """
    syn_generates = set()
    for num in range(twice):
        syn_generate = generate_random_1(model_marko, generated)
        # start from 'BEGIN' words for all attempts after the first one
        generated = []
        if len(syn_generate) > len_min:
            syn_generates.add(syn_generate)
    return list(syn_generates)
def get_keyword_from_tf(sentences, p):
    """
    Get the hot words of a corpus by term frequency.
    :param sentences: list, sentences to cut, joined by " "
    :param p: float, rate, 0 < p < 1
    :return: list, words
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    # token_pattern sets the token-counting mode; the default, as for English, skips single characters
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    # norm=None: do not normalize the term-frequency counts
    # use_idf=False: the tf-idf helper is used here, so skip the idf part
    transformer = TfidfTransformer(norm=None, use_idf=False)
    vectorizer.fit_transform(sentence_cut_list)
    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
    word = vectorizer.get_feature_names()
    # weight = tf.toarray()
    return word[-int(len(word) * p):]
def get_begin_word(sentences, p):
    """
    Get the first word of each sentence after jieba segmentation.
    :param sentences: list, input sentences
    :param p: float, rate, 0 < p < 1
    :return: list, begin words
    """
    sentence_cut_begin_list = [list(jieba.cut(text.strip(), cut_all=False, HMM=True))[0] for text in sentences]
    len_begin_p = int(len(sentence_cut_begin_list) * p)
    return sentence_cut_begin_list[-len_begin_p:]
def get_keyword_from_jieba_tfidf(sentences, p):
    """
    Extract keywords with the TF-IDF algorithm.
    :param sentences: list, input sentences
    :param p: float, rate (unused here)
    :return: list, keywords
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    sentence_cut_list_str = str(sentence_cut_list)
    key_word = tfidf(sentence_cut_list_str)
    return key_word
def get_keyword_from_jieba_textrank(sentences, p):
    """
    Extract keywords with the TextRank algorithm.
    :param sentences: list, input sentences
    :param p: float, rate, 0 < p < 1
    :return: list, keywords
    """
    key_words = []
    for sentences_one in sentences:
        key_word = textrank(sentences_one)
        key_words = key_words + key_word
    # token_pattern sets the token-counting mode; the default, as for English, skips single characters
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    vectorizer.fit_transform(key_words)
    word = vectorizer.get_feature_names()
    return word[-int(len(word) * p):]
def generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1):
    """
    Generate sentences that do not occur in the original corpus.
    :param sentence_list: list, corpus sentences
    :param begin_word: str, "tf", "tfidf", "textrank" or "begin_word"
    :param p: float, rate, 0 < p < 1
    :return: list, generated sentences
    """
    # get the hot keywords used as start words
    if begin_word == "tf":
        generated_hot = get_keyword_from_tf(sentence_list, p)
    elif begin_word == "textrank":
        generated_hot = get_keyword_from_jieba_textrank(sentence_list, p)
    elif begin_word == "begin_word":
        generated_hot = get_begin_word(sentence_list, p)
    else:
        generated_hot = get_keyword_from_jieba_tfidf(sentence_list, p)
    # build the transition model
    model_txt = {}
    model_txt = create_model(model_txt, sentence_list)
    # generate synonymous sentences starting from the keywords
    gen_all_syn = []
    for generated_hot_one in generated_hot:
        generated_hot_one_1 = [generated_hot_one]
        generated_str = generate_random_select(generated_hot_one_1, model_txt, twice=1000, len_min=5)
        if generated_str:
            gen_all_syn = gen_all_syn + generated_str
    # keep only sentences that are not in the original corpus
    gen_all_syn = list(set(gen_all_syn))
    # intersection of the generated sentences and the original sentences
    syn_intersection = list(set(sentence_list).intersection(set(gen_all_syn)))
    # generated sentences minus that intersection
    gen_syns = list(set(gen_all_syn).difference(set(syn_intersection)))
    return gen_syns
if __name__ == "__main__":
    # read a corpus file, then generate sentences
    txt_path = chicken_and_gossip_path
    sentence_list = txtRead(txt_path)
    sentence_list = sentence_list[0:100]
    enhance_texts = generate_syns_from_list(sentence_list, begin_word="tfidf", p=0.1)
    for enhance_texts_one in enhance_texts:
        try:
            print(enhance_texts_one)
        except Exception as e:
            print(str(e))


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:16
# @author :Mo
# @function :


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 21:15
# @author :Mo
# @function :


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:58
# @author :Mo
# @function :


@ -0,0 +1,107 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 23:05
# @author :Mo
# @function :back-translation with a Tencent account (Fanyijun)
from conf.augment_constant import language_short_tencent
from conf.augment_constant import app_secret_tentcnet
from conf.augment_constant import app_key_tencent
from urllib.parse import quote
import logging as logger
import requests
import hashlib
import random
import string
import time
import json
def md5_sign(text):
    """
    Generate the MD5 signature of a string.
    :param text: str, string to sign
    :return: str, uppercase hex digest
    """
    md5_model = hashlib.md5(text.encode("utf8"))
    return md5_model.hexdigest().upper()
def get_params(text, from_l="zh", to_l="en"):
    """
    Build the request params, including the sign.
    :param text: str, input sentence
    :param from_l: str, source language
    :param to_l: str, target language
    :return: dict, params
    """
    # request timestamp in seconds, used to prevent replay; the signature stays valid for 5 minutes
    time_stamp = str(int(time.time()))
    # random request string, so the signature cannot be predicted
    nonce_str = ''.join(random.sample(string.ascii_letters + string.digits, 10))
    params = {'app_id': app_key_tencent,
              'source': from_l,
              'target': to_l,
              'text': text,
              'time_stamp': time_stamp,
              'nonce_str': nonce_str
              }
    signs = ''
    # the keys must be sorted before concatenation
    for key in sorted(params):
        # each value must be URL-encoded with uppercase hex (e.g. %E8); quote() uses uppercase by default
        signs += '{}={}&'.format(key, quote(params[key], safe='').replace("%20", "+"))
    # append the app secret, with key name app_key, to the end of the string
    signs += 'app_key={}'.format(app_secret_tentcnet)
    # the MD5 of the assembled string is the request signature
    sign = md5_sign(signs)
    params['sign'] = sign
    return params
def any_to_any_translate_tencent(text, from_='zh', to_='en'):
    """
    Call Tencent translation from any language to any other;
    see the constant language_short_tencent for the supported codes.
    :param text: str, input sentence
    :param from_: str, source language
    :param to_: str, target language
    :return: str, translated sentence
    """
    try:
        url = "https://api.ai.qq.com/fcgi-bin/nlp/nlp_texttranslate"
        params_text = get_params(text, from_l=from_, to_l=to_)
        res_post = requests.request("POST", url, data=params_text)
        res_content = res_post.content.decode("utf8")
        res_json = json.loads(res_content)
        target_text = res_json["data"]["target_text"]
        return target_text
    except Exception as e:
        logger.error(str(e))
        return None
def translate_tencent_back(text, from_='zh', to_='en'):
    """
    Back-translation: call Tencent translation twice.
    :param text: str, input sentence
    :param from_: str, source language
    :param to_: str, target language
    :return: str, back-translated sentence
    """
    try:
        text_from_to = any_to_any_translate_tencent(text, from_=from_, to_=to_)
        text_to_from = any_to_any_translate_tencent(text_from_to, from_=to_, to_=from_)
        return text_to_from
    except Exception as e:
        logger.error(str(e))
        return None


if __name__ == '__main__':
    text_test = "你觉得JY会喜欢暗影随风、大漠帝国吗".strip()
    for to_test in language_short_tencent:
        res_test = translate_tencent_back(text_test, from_='zh', to_=to_test)
        print("Empty without an account; back-translation result: ")
        print(res_test)


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:57
# @author :Mo
# @function :


@ -0,0 +1,46 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 23:05
# @author :Mo
# @function :back-translation with the translate package's Translator
from conf.augment_constant import language_short_google
from utils.text_tools import judge_translate_english
from translate import Translator
def translate_tools_translate(text, to_='en'):
    """
    Back-translate a sentence with the translate package.
    :param text: str, input sentence
    :param to_: str, intermediate language
    :return: str, back-translated sentence
    """
    # available providers include 'mymemory' and 'microsoft'
    translator1 = Translator(to_lang=to_, from_lang='zh', provider=None, secret_access_key=None)
    translator2 = Translator(to_lang="zh", from_lang=to_, provider=None, secret_access_key=None)
    translation1 = translator1.translate(text)
    translation2 = translator2.translate(translation1)
    return translation2


if __name__ == "__main__":
    sen_org = "大漠帝国喜欢RSH、JY吗"
    for language_short_google_one in language_short_google:
        text_translate = translate_tools_translate(sen_org, to_=language_short_google_one)
        judge = judge_translate_english(sen_org, text_translate)
        if judge:
            print("True")
            print(text_translate)
        else:
            print("False")
            print(text_translate)
# Test results:
# False
# 沙漠帝国是否像RSHJY
# False
# 沙漠帝国看起来像RSHJY
# False
# 帝国沙漠像rshjy


@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:58
# @author :Mo
# @function :


@ -0,0 +1,154 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @Time :2019/3/21 14:30
# @author :Mo
# @function :back-translation: call Google Translate by simulating the google token access
from conf.augment_constant import language_short_google
from utils.text_tools import judge_translate_english
import logging as logger
import urllib.parse as parse
import requests
import execjs
class GoogleToken:
    def __init__(self):
        # the token algorithm (TL) ported from Google Translate's web page, run via execjs
        self.ctx = execjs.compile("""
        function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;
            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";
            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240,
                e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128),
                e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f],
            a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };
        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2),
                d = d >= t ? d.charCodeAt(0) - 87 : Number(d),
                d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)

    def get_google_token(self, text):
        """
        Get the Google access token for a piece of text.
        :param text: str, input sentence
        :return: str, token
        """
        return self.ctx.call("TL", text)
def open_url(url):
    """
    Request a URL with a browser User-Agent header.
    :param url: str, target URL
    :return: str, decoded response body
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    req = requests.get(url=url, headers=headers)
    return req.content.decode('utf-8')


def max_length(content):
    """
    Log a warning if the content exceeds the maximum translatable length.
    :param content: str, text to translate
    """
    if len(content) > 4891:
        logger.info("The text to translate exceeds the length limit!")
        return


def translate_result(result):
    """
    Strip the irrelevant parts of the raw response.
    :param result: str, raw response
    :return: str or None, the translated sentence
    """
    str_end = result.find("\",")
    if str_end > 4:
        return result[4:str_end]
    else:
        return None
def any_to_any_translate(content, from_='zh-CN', to_='en'):
    """
    Translate between any two languages.
    :param content: str, user input, at most 4891 characters
    :param from_: str, source language
    :param to_: str, target language
    :return: str, translated sentence
    """
    max_length(content)
    tk = google_token.get_google_token(content)  # renamed from the original typo google_tokn
    content = parse.quote(content)
    url = "http://translate.google.cn/translate_a/single?client=t&sl={0}&tl={1}" \
          "&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&" \
          "ie=UTF-8&oe=UTF-8&source=btn&ssel=3&tsel=3&kc=0&tk={2}&q={3}".format(from_, to_, tk, content)
    result = open_url(url)
    res = translate_result(result)
    return res


def any_to_any_translate_back(content, from_='zh-CN', to_='en'):
    """
    Back-translation, e.g. Chinese -> English -> Chinese.
    :param content: str, user input, at most 4891 characters
    :param from_: str, source language
    :param to_: str, target language
    :return: str, back-translated sentence
    """
    translate_content = any_to_any_translate(content, from_=from_, to_=to_)
    result = any_to_any_translate(translate_content, from_=to_, to_=from_)
    return result
if __name__ == '__main__':
    google_token = GoogleToken()
    while True:
        sen_org = "过路蜻蜓喜欢口袋巧克力,这是什么意思"
        for language_short_google_one in language_short_google:
            text_translate = any_to_any_translate_back(sen_org, from_='zh', to_=language_short_google_one)
            judge = judge_translate_english(sen_org, text_translate)
            if judge:
                print(language_short_google_one + " " + "True")
                print(text_translate)
            else:
                print(language_short_google_one + " " + "False")
                print(text_translate)
# Test results:
# en False
# 我喜欢口袋巧克力,这是什么意思?
# fr False
# 我喜欢口袋巧克力,这是什么意思?
# ru False
# 我喜欢口袋糖果,这是什么意思?
# de False
# 我喜欢袋巧克力,这是什么意思?