nlp_xiaojiang/AugmentText/augment_syntax/augment_mainpart.py
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/4/9 22:49
# @author :Mo
# @function :get main_part by stanfordcorenlp
from conf.path_config import stanford_corenlp_full_path
from stanfordcorenlp import StanfordCoreNLP
# stanford-corenlp-full-2018-10-05 must be downloaded in advance; startup is slow
nlp = StanfordCoreNLP(stanford_corenlp_full_path, lang='zh')
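
# Note: StanfordCoreNLP(path, lang='zh') starts a local Java CoreNLP server from the
# unpacked stanford-corenlp-full-2018-10-05 directory, so a Java runtime must be
# installed; lang='zh' selects the Chinese models.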

def stanford_parse(sentence):
    """
        tokenize, tag, parse and dependency-parse one sentence with stanfordcorenlp
    :param sentence: str, input sentence
    :return: dict, {'tokenize': list, 'dependence': list, 'parse': str}
    """
    tokenize = nlp.word_tokenize(sentence)
    pos_tag = nlp.pos_tag(sentence)    # computed but not returned
    name_entity = nlp.ner(sentence)    # computed but not returned
    syntax_tree = nlp.parse(sentence)
    dependence = nlp.dependency_parse(sentence)
    result_dict = {}
    result_dict['tokenize'] = tokenize
    result_dict['dependence'] = dependence
    result_dict['parse'] = syntax_tree
    return result_dict
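
# Note on the dependency format (as returned by the stanfordcorenlp wrapper):
# nlp.dependency_parse gives a list of (relation, governor_index, dependent_index)
# triples with 1-based token indices, where governor 0 marks the ROOT, e.g. roughly
#     [('ROOT', 0, 2), ('nsubj', 2, 1), ('dobj', 2, 3)]
# for a three-token subject-verb-object sentence. The functions below rely on this
# shape when indexing tokenize[dependence_one[1] - 1] / tokenize[dependence_one[2] - 1].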

def combine_nn(tokenize, dependence, target):
    """
        merge compound-noun (nn) modifiers into the target word, e.g. for noun phrases
    :param tokenize: list, tokenized words of the sentence
    :param dependence: list, (relation, governor_index, dependent_index) triples
    :param target: str, subject or object word
    :return: str, target with its nn modifier prepended
    """
    if not target:
        return target
    else:
        for dependence_one in dependence:
            # governor word of this relation; index 0 stands for the virtual "root"
            governor = tokenize[dependence_one[1] - 1] if dependence_one[1] != 0 else "root"
            if governor == target and dependence_one[0] == "nn":
                # prepend the modifier (the dependent word) and stop at the first match
                target = tokenize[dependence_one[2] - 1] + target
                return target
        return target
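
# Illustrative sketch of combine_nn on a hand-made parse (hypothetical, not real CoreNLP output):
#     tokenize   = ["检索", "系统", "智能"]
#     dependence = [("nn", 2, 1), ("ROOT", 0, 3)]
#     combine_nn(tokenize, dependence, "系统")  ->  "检索系统"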

def get_main_part_by_stanfordcorenlp(text):
    """
        extract the main part (subject + predicate + object) of a sentence from its parses
    :param text: str, input sentence
    :return: str, subject + predicate + object
    """
    # tokenize and parse with stanfordcorenlp
    result_dict = stanford_parse(text)
    tokenize = result_dict['tokenize']
    dependence = result_dict['dependence']
    # split the bracketed constituency parse into lines, one constituent per line
    syntax_tree = result_dict['parse'].split("\n")
    # extract subject (主), predicate (谓) and object (宾)
    part_main = {"主": "", "谓": "", "宾": ""}
    if len(syntax_tree) >= 2:
        if "NP" in syntax_tree[1] or "ROOT" not in str(dependence):  # a bare noun phrase, or no predicate at all
            count = 0
            for syntax_tree_single in syntax_tree:
                if "NP" in syntax_tree_single and "(" in syntax_tree_single and ")" in syntax_tree_single:
                    token_np = syntax_tree_single.split(" ")[-1]
                    token_np = token_np.replace("'", "").replace(")", "").strip()
                    part_main["主"] = token_np if count == 0 else part_main["主"] + token_np
                    count += 1
            return part_main["主"] + part_main["谓"] + part_main["宾"]
        else:
            for dependence_one in dependence:
                dep = dependence_one[0]
                dep_dep_gloss = tokenize[dependence_one[2] - 1]  # dependent word of this relation
                if dep == "ROOT":  # the ROOT word acts as the predicate
                    part_main["谓"] = dep_dep_gloss
                elif dep == "cop":  # copula (subject-linking) construction
                    part_main["谓"] = dep_dep_gloss + part_main["谓"]
                else:  # subject and object
                    if dep == "nsubjpass" or dep == "dobj" or dep == "attr":
                        part_main["宾"] = dep_dep_gloss
                    elif dep == "nsubj" or dep == "top":
                        part_main["主"] = dep_dep_gloss
    # merge compound-noun (nn) modifiers into subject and object before concatenating
    part_main["主"] = combine_nn(tokenize, dependence, part_main["主"])
    part_main["宾"] = combine_nn(tokenize, dependence, part_main["宾"])
    return part_main["主"] + part_main["谓"] + part_main["宾"]
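
# Expected behaviour (actual output depends on the CoreNLP Chinese models, so this is only indicative):
#     get_main_part_by_stanfordcorenlp("大漠帝国确实很喜欢JY")  ->  roughly "大漠帝国喜欢JY"
# i.e. subject + predicate + object with adverbs and other modifiers stripped away.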

if __name__ == "__main__":
    sentence_list = [
        "大漠帝国确实很喜欢JY",
        "JY也喜欢大漠帝国哦",
        "这个工程的作者是momo",
        "momo是一个无门无派的浪人",
        "只有自信的程序员才能把握未来",
        "主干识别可以提高检索系统的智能",
        "打更的住在这里",
        "人民的名义",
        "名词短语",
        "我一直很喜欢你",
        "你被我喜欢",
        "美丽又善良的你被卑微的我深深的喜欢着……",
        "搜索momo可以找到我的博客",
        "静安区体育局2013年部门决算情况说明",
        "红旗飘",
        "柳丝长",
        "乐队奏国歌",
        "红扑扑的朝霞露出了笑脸",
        "初升的太阳照耀着峻峭的群山",
        "一个农人在路上看见一条冻僵了的蛇",
        "我打量了他一眼", ]
    sentence_type = ["陈述句与否定句",
                     "秦耕真是一个聪明的孩子",
                     "衣服洗得不干净",
                     "他没有做完作业",
                     "他不敢不来",
                     "没有一个人不怕他",
                     "我非把这本书读完不可",
                     "同学们无不欢欣鼓舞",
                     "他妈妈不让他去,无非是怕他吃亏",
                     "想起一个人的旅途,不无寂寥之感",
                     "你未必不知道",
                     "各种问句",
                     "你可以那到100分, 是吗?",
                     "刚才接你的人是谁?",
                     "什么叫函数?",
                     "你爸爸怎么样了?",
                     "你每天几点休息?",
                     "你爸爸在哪儿?",
                     "我们是从广州走, 还是从成都走?",
                     "他是不是又迟到了?",
                     "难道他已经跑了?",
                     "我怎么能负这个责任呢?",
                     "你是来帮助我们的, 还是来拆我们的台的?",
                     "这些人甘愿当走狗, 你说可恨不可恨?",
                     "祈使句",
                     "快去捞饭!米烂了!",
                     "给我喝水, 我渴!",
                     "走哇, 妈妈!",
                     "不许动!",
                     "太好啦",
                     ]
    for sen_one in sentence_list:
        subject_object = get_main_part_by_stanfordcorenlp(sen_one)
        print(sen_one + " " + subject_object)
    # interactive test; enter an empty line to stop so that the CoreNLP server gets closed
    while True:
        print("请输入sentence ")
        sen_test = input()
        if not sen_test.strip():
            break
        # syn_sentence_test = syn_by_syntactic_analysis(sen_test)
        syn_sentence_test = get_main_part_by_stanfordcorenlp(sen_test)
        print(syn_sentence_test)
    # Do not forget to close! The backend server will consume a lot of memory
    nlp.close()