#!/usr/bin/env python3
# coding: utf-8
# File: sentence_parser.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-3-10
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
class LtpParser:
    """Wrapper around the pyltp models used to analyse one sentence.

    The constructor loads the segmentation, POS-tagging, dependency-parsing,
    named-entity and semantic-role-labelling models; the other methods turn
    the raw pyltp results into plain lists and dicts.
    """

    def __init__(self, ltp_dir="./ltp_data"):
        """Instantiate every pyltp component and load its model file.

        Args:
            ltp_dir: Directory holding ``cws.model``, ``pos.model``,
                ``parser.model``, ``ner.model`` and ``pisrl.model``.
                Defaults to the previously hard-coded ``./ltp_data``.
        """
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(ltp_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(ltp_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(ltp_dir, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(ltp_dir, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(ltp_dir, 'pisrl.model'))

    def format_labelrole(self, words, postags, arcs=None):
        """Run semantic role labelling and index the result by predicate.

        Args:
            words: Segmented words of the sentence.
            postags: POS tags aligned with ``words``.
            arcs: Optional dependency parse of the same sentence. When
                omitted the sentence is parsed here, as before; passing it
                lets callers that already hold a parse avoid a second one.

        Returns:
            ``{role.index: {arg.name: [arg.name, start, end]}}`` where
            ``start``/``end`` are ``arg.range.start`` and ``arg.range.end``.
        """
        if arcs is None:
            arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        return {
            role.index: {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
            for role in roles
        }

    def build_parse_child_dict(self, words, postags, arcs):
        """Convert dependency arcs into per-word child maps and flat rows.

        Args:
            words: Segmented words of the sentence.
            postags: POS tags aligned with ``words``.
            arcs: Indexable sequence of objects exposing ``head`` (1-based
                index of the governing word, 0 for the root) and
                ``relation``; ``arcs[i]`` describes ``words[i]``.

        Returns:
            A pair ``(child_dict_list, format_parse_list)``:

            * ``child_dict_list[i]`` maps a relation name to the list of
              indices of the words that depend on word ``i`` through it.
            * ``format_parse_list[i]`` is
              ``[relation, word, i, postag, head_word, head_index,
              head_postag]``, e.g.
              ``['ATT', '李克强', 0, 'nh', '总理', 1, 'n']``.
        """
        word_count = len(words)

        # One pass over the arcs: file each dependent under its head.
        child_dict_list = [{} for _ in range(word_count)]
        for arc_index, arc in enumerate(arcs):
            # Heads are 1-based; 0 is the virtual root, which owns no slot.
            if 1 <= arc.head <= word_count:
                children = child_dict_list[arc.head - 1]
                children.setdefault(arc.relation, []).append(arc_index)

        format_parse_list = []
        for i in range(word_count):
            arc = arcs[i]
            head_index = arc.head - 1
            head_word = 'Root' if arc.head == 0 else words[head_index]
            # NOTE: for the root word head_index is -1, so the head POS
            # column holds the tag of the last word (kept as in the
            # original row format).
            format_parse_list.append([
                arc.relation,
                words[i],
                i,
                postags[i],
                head_word,
                head_index,
                postags[head_index],
            ])
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Segment, tag, parse and role-label ``sentence``.

        Returns:
            ``(words, postags, child_dict_list, roles_dict,
            format_parse_list)``; see ``build_parse_child_dict`` and
            ``format_labelrole`` for the shape of the last three.
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        # Reuse the parse computed above instead of parsing a second time.
        roles_dict = self.format_labelrole(words, postags, arcs)
        return words, postags, child_dict_list, roles_dict, format_parse_list
if __name__ == '__main__':
    # Smoke run: analyse one sample sentence and dump every structure
    # (except the role dict) together with its length.
    ltp = LtpParser()
    sample = '李克强总理今天来我家了,我感到非常荣幸'
    tokens, tags, children, _roles, dep_rows = ltp.parser_main(sample)
    for structure in (tokens, tags, children, dep_rows):
        print(structure, len(structure))

#!/usr/bin/env python3
# coding: utf-8
# File: triple_extraction.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-3-12
from sentence_parser import *
import re
class TripleExtractor:
    """Extract subject / predicate / object triples from Chinese text.

    Semantic role labels are tried first for every word; when they do not
    yield a full triple, dependency-relation patterns are used instead.
    """

    def __init__(self):
        """Create the ``LtpParser`` used to analyse each sentence."""
        self.parser = LtpParser()

    def split_sents(self, content):
        """Split ``content`` into sentences.

        Question marks, exclamation marks, the Chinese full stop,
        semicolons, colons and line breaks act as delimiters; empty
        fragments are dropped.
        """
        return [sentence for sentence in re.split(r'[?!。;;:\n\r]', content) if sentence]

    @staticmethod
    def _join_span(words, postags, span, skipped_classes):
        """Concatenate the words of one role argument span.

        ``span`` is ``[name, start, end]`` with an inclusive ``end``. Words
        whose POS tag starts with a letter in ``skipped_classes`` are left
        out.
        """
        _, start, end = span
        return ''.join(
            words[word_index]
            for word_index in range(start, end + 1)
            if postags[word_index][0] not in skipped_classes and words[word_index]
        )

    def ruler1(self, words, postags, roles_dict, role_index):
        """Build a triple for the predicate at ``role_index`` from its roles.

        Args:
            words: Segmented words of the sentence.
            postags: POS tags aligned with ``words``.
            roles_dict: ``{predicate_index: {role_name: [name, start, end]}}``.
            role_index: Index of the predicate; must be a key of
                ``roles_dict``.

        Returns:
            ``(flag, items)`` where ``flag`` is

            * ``'1'`` - both A0 and A1 present and non-empty:
              ``[subject, verb, object]``;
            * ``'2'`` - only A0 present and non-empty: ``[subject, verb]``;
            * ``'3'`` - only A1 present: ``[verb, object]``;
            * ``'4'`` - nothing usable: ``[]``.
        """
        v = words[role_index]
        role_info = roles_dict[role_index]
        if 'A0' in role_info and 'A1' in role_info:
            s = self._join_span(words, postags, role_info['A0'], ('w', 'u'))
            o = self._join_span(words, postags, role_info['A1'], ('w', 'u'))
            if s and o:
                return '1', [s, v, o]
        elif 'A0' in role_info:
            s = self._join_span(words, postags, role_info['A0'], ('w', 'u'))
            if s:
                return '2', [s, v]
        elif 'A1' in role_info:
            # Only tags starting with 'w' are skipped for an object-only span.
            o = self._join_span(words, postags, role_info['A1'], ('w',))
            return '3', [v, o]
        return '4', []

    def ruler2(self, words, postags, child_dict_list, arcs, roles_dict):
        """Extract every triple of one analysed sentence.

        Args:
            words: Segmented words of the sentence.
            postags: POS tags aligned with ``words``.
            child_dict_list: Per-word ``{relation: [child indices]}`` maps.
            arcs: Per-word rows ``[relation, word, index, postag,
                head_word, head_index, head_postag]`` (the
                ``format_parse_list`` returned by ``LtpParser.parser_main``).
            roles_dict: Semantic roles keyed by predicate index.

        Returns:
            List of ``[e1, relation, e2]`` triples.
        """
        svos = []
        for index, postag in enumerate(postags):
            # Prefer the semantic-role result when it yields a full triple.
            if index in roles_dict:
                flag, triple = self.ruler1(words, postags, roles_dict, index)
                if flag == '1':
                    # Keep the triple; previously it was computed and dropped.
                    svos.append(triple)
                    continue

            # Otherwise fall back to dependency patterns. Any non-empty tag
            # is accepted (the check is not restricted to verbs).
            if not postag:
                continue
            child_dict = child_dict_list[index]

            # Subject-verb-object.
            if 'SBV' in child_dict and 'VOB' in child_dict:
                r = words[index]
                e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                svos.append([e1, r, e2])

            # Verb phrase used as an attribute of a noun: the modified noun
            # acts as the subject of the verb-object pair.
            relation = arcs[index][0]
            if relation == 'ATT' and 'VOB' in child_dict:
                # Slot 5 is the 0-based index of the head word (slot 2 is
                # the word's own index).
                head = arcs[index][5]
                e1 = self.complete_e(words, postags, child_dict_list, head)
                r = words[index]
                e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                temp_string = r + e2
                # The expanded head starts with this very modifier; strip it.
                if temp_string == e1[:len(temp_string)]:
                    e1 = e1[len(temp_string):]
                if temp_string not in e1:
                    svos.append([e1, r, e2])

            # Subject-verb-complement where the complement governs a
            # prepositional object.
            if 'SBV' in child_dict and 'CMP' in child_dict:
                e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                r = words[index] + words[cmp_index]
                if 'POB' in child_dict_list[cmp_index]:
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                    svos.append([e1, r, e2])
        return svos

    def complete_e(self, words, postags, child_dict_list, word_index):
        """Expand the word at ``word_index`` into a fuller phrase.

        All ``ATT`` children are expanded recursively and prepended. For a
        word tagged ``'v'`` the first ``VOB`` child is appended and the
        first ``SBV`` child is put in front of everything else.
        """
        child_dict = child_dict_list[word_index]
        prefix = ''.join(
            self.complete_e(words, postags, child_dict_list, att_index)
            for att_index in child_dict.get('ATT', [])
        )
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) + prefix
        return prefix + words[word_index] + postfix

    def triples_main(self, content):
        """Split ``content`` into sentences and collect the triples of each."""
        svos = []
        for sentence in self.split_sents(content):
            words, postags, child_dict_list, roles_dict, arcs = self.parser.parser_main(sentence)
            svos.extend(self.ruler2(words, postags, child_dict_list, arcs, roles_dict))
        return svos
def test():
    """Run the extractor on a sample text and print the resulting triples.

    ``content2`` and ``content3`` are alternative sample inputs; only
    ``content1`` is processed.
    """
    content1 = """环境很好,位置独立性很强,比较安静很切合店名,半闲居,偷得半日闲。点了比较经典的菜品,味道果然不错!烤乳鸽,超级赞赞赞,脆皮焦香,肉质细嫩,超好吃。艇仔粥料很足,香葱自己添加,很贴心。金钱肚味道不错,不过没有在广州吃的烂,牙口不好的慎点。凤爪很火候很好,推荐。最惊艳的是长寿菜,菜料十足,很新鲜,清淡又不乏味道,而且没有添加调料的味道,搭配的非常不错!"""
    # The two literals below were left unterminated, which swallowed the
    # rest of the function; each is closed at the end of its text.
    content2 = """近日一条男子高铁吃泡面被女乘客怒怼的视频引发热议。女子情绪激动言辞激烈大声斥责该乘客称高铁上有规定不能吃泡面质问其“有公德心吗”“没素质”。视频曝光后该女子回应称因自己的孩子对泡面过敏曾跟这名男子沟通过但对方执意不听她才发泄不满并称男子拍视频上传已侵犯了她的隐私权和名誉权将采取法律手段。12306客服人员表示高铁、动车上一般不卖泡面但没有规定高铁、动车上不能吃泡面。"""
    content3 = '''(原标题:央视独家采访:陕西榆林产妇坠楼事件在场人员还原事情经过)'''
    extractor = TripleExtractor()
    svos = extractor.triples_main(content1)
    print('svos', svos)