From 08408e8fd9c265a44f4dd326f94db1103a2efc2c Mon Sep 17 00:00:00 2001 From: yongzhuo <31341349+yongzhuo@users.noreply.github.com> Date: Sat, 27 Jul 2019 17:54:51 +0800 Subject: [PATCH] translate_google.py bug: couldn't translate long text --- .../translate_web/translate_google.py | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/AugmentText/augment_translate/translate_web/translate_google.py b/AugmentText/augment_translate/translate_web/translate_google.py index 2ab2b02..3781b07 100644 --- a/AugmentText/augment_translate/translate_web/translate_google.py +++ b/AugmentText/augment_translate/translate_web/translate_google.py @@ -4,12 +4,14 @@ # @author :Mo # @function :回译调用谷歌翻译,模拟google token访问 -from conf.augment_constant import language_short_google -from utils.text_tools import judge_translate_english import logging as logger import urllib.parse as parse -import requests + import execjs +import requests + +from nlp_xiaojiang.conf.augment_constant import language_short_google +from nlp_xiaojiang.utils.text_tools import judge_translate_english class GoogleToken: @@ -71,7 +73,9 @@ def open_url(url): 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'} req = requests.get(url=url, headers=headers) - return req.content.decode('utf-8') + # print('req.txt:') + # print(req.text.encode('gbk', 'ignore').decode('gbk')) + return req # .content.decode('utf-8') def max_length(content): @@ -82,7 +86,9 @@ def max_length(content): """ if len(content) > 4891: logger.info("翻译文本超过限制!") - return + return 4891 + else: + return None def translate_result(result): @@ -91,11 +97,11 @@ def translate_result(result): :param result: str :return: str """ - str_end = result.find("\",") - if str_end > 4: - return result[4:str_end] - else: - return None + result_last = '' + for res in result[0]: + if res[0]: + result_last += res[0] + return result_last def any_to_any_translate(content, from_='zh-CN', to_='en'): @@ -106,14 +112,17 @@ def any_to_any_translate(content, from_='zh-CN', to_='en'): :param to_: str, target language :return: str, result of translate """ - max_length(content) + max_len = max_length(content) + if max_len: + content = content[0:max_len] tk = google_tokn.get_google_token(content) content = parse.quote(content) url = "http://translate.google.cn/translate_a/single?client=t&sl={0}&tl={1}" \ "&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&" \ "ie=UTF-8&oe=UTF-8&source=btn&ssel=3&tsel=3&kc=0&tk={2}&q={3}".format(from_, to_, tk, content) result = open_url(url) - res = translate_result(result) + result_json = result.json() + res = translate_result(result_json) return res @@ -133,16 +142,22 @@ def any_to_any_translate_back(content, from_='zh-CN', to_='en'): if __name__ == '__main__': google_tokn = GoogleToken() while True: - sen_org = "过路蜻蜓喜欢口袋巧克力,这是什么意思" + # sen_org = "过路蜻蜓喜欢口袋巧克力,这是什么意思" + sen_org = "此外,李奇霖还认为,MLF期限是6个月,逆回购是7天,考虑到外汇占款流出的是长期限流动性," \ + "因此,无论哪一种货币投放模式都无法替代降准,降准的期限理论上是“无穷期”的。" \ + "从资金利率看,MLF资金利率在3.35%,比起降准释放的“无成本”流动性仍然偏高," \ + "经济下行压力之下,实体能提供的高收益资产有限,较高的资金利率能否缓解外汇占款对信用派生的收缩作用,也是有疑虑的。" \ + "“等汇率端稍见稳定后,我们能看到降准的出现,幅度约为100BP,时点预计在9月上旬。" + for language_short_google_one in language_short_google: text_translate = any_to_any_translate_back(sen_org, from_='zh', to_=language_short_google_one) judge = judge_translate_english(sen_org, text_translate) if judge: print(language_short_google_one + " " + "True") - print(text_translate) + print(text_translate.encode('gbk', 'ignore').decode('gbk')) else: print(language_short_google_one + " " + "False") - print(text_translate) + print(text_translate.encode('gbk', 'ignore').decode('gbk')) #测试结果 # en False # 我喜欢口袋巧克力,这是什么意思? @@ -151,4 +166,4 @@ if __name__ == '__main__': # ru False # 我喜欢口袋糖果,这是什么意思? # de False -# 我喜欢袋巧克力,这是什么意思? \ No newline at end of file +# 我喜欢袋巧克力,这是什么意思?