diff --git a/models/HMEAE: Hierarchical Modular Event Argument Extraction/utils.py b/models/HMEAE: Hierarchical Modular Event Argument Extraction/utils.py
new file mode 100644
index 0000000..660c93d
--- /dev/null
+++ b/models/HMEAE: Hierarchical Modular Event Argument Extraction/utils.py
@@ -0,0 +1,640 @@
+import os
+import constant
+from xml.dom.minidom import parse
+from tqdm import tqdm
+from stanfordcorenlp import StanfordCoreNLP
+import re
+import random
+import json
+import numpy as np
+import copy
+
+class StanfordCoreNLPv2(StanfordCoreNLP):
+    def __init__(self,path):
+        super(StanfordCoreNLPv2,self).__init__(path)
+
+    def sent_tokenize(self,sentence):
+        r_dict = self._request('ssplit,tokenize', sentence)
+        tokens = [[token['originalText'] for token in s['tokens']] for s in r_dict['sentences']]
+        spans = [[(token['characterOffsetBegin'], token['characterOffsetEnd']) for token in s['tokens']] for s in r_dict['sentences']]
+        return tokens, spans
+
+class Extractor():
+    def __init__(self):
+        self.dirs = ['bc','bn','cts','nw','un','wl']
+        self.split_tags = {'bc':["",'','',''],
+                           'bn':["",""],
+                           "cts":["",""],
+                           'nw':['','','',''],
+                           'un':['','','','','','','','','','','']}
+        self.Events = []
+        self.None_events = []
+        self.Entities = []
+
+    def find_index(self,offsets,offset): #offsets [) offset []
+        idx_start = -1
+        idx_end = -1
+        for j, _offset in enumerate(offsets):
+            if idx_start == -1 and _offset[0] <= offset[0] and _offset[1] > offset[0]:
+                idx_start = j
+            if idx_end == -1 and _offset[0] <= offset[1] and _offset[1] > offset[1]:
+                idx_end = j
+                break
+        assert idx_start!=-1 and idx_end!=-1
+        return idx_start,idx_end
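+    # Example (hypothetical values): with end-exclusive token spans
+    # [(0,3),(4,9),(10,14)] and the inclusive character span (4,13),
+    # find_index returns (1,2): the span starts in the second token
+    # and ends in the third.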
+
+    def sentence_distillation(self,sents,offsets,dir):
+        # split sentences on genre-specific SGML tags and drop spans that
+        # sit inside other markup
+        mark_split_tag = self.split_tags[dir]
+
+        new_sents = []
+        new_offsets = []
+
+        if dir == 'cts':
+            sents = sents[1:]
+            offsets = offsets[1:]
+
+        for i, sent in enumerate(sents):
+            offset_per_sentence = offsets[i]
+            select = True
+
+            start_posi = 0
+            for j, token in enumerate(sent):
+                if bool(sum([token.startswith(e) for e in mark_split_tag])):
+                    subsent = sent[start_posi:j]
+                    suboffset = offset_per_sentence[start_posi:j]
+                    if select and len(subsent) > 0:
+                        assert (0, 0) not in suboffset
+                        new_sents.append(subsent)
+                        new_offsets.append(suboffset)
+                    start_posi = j + 1
+                    select = True
+                elif token.startswith('<'):
+                    select = False
+
+            subsent = sent[start_posi:]
+            suboffset = offset_per_sentence[start_posi:]
+            if select and len(subsent) > 0:
+                assert (0, 0) not in suboffset
+                new_sents.append(subsent)
+                new_offsets.append(suboffset)
+        return new_sents,new_offsets
+
+    def correct_offsets(self,sents,offsets):
+        # shift character offsets left by the accumulated length of SGML tag
+        # tokens, so offsets index into the tag-free text
+        new_offsets = []
+        minus = 0
+        for i,offsets_per_sentence in enumerate(offsets):
+            sentence = sents[i]
+            new_offsets_per_sentence = []
+            for j,offset in enumerate(offsets_per_sentence):
+                if sentence[j].startswith('<'):
+                    new_offsets_per_sentence.append((0,0))
+                    minus += len(sentence[j])
+                else:
+                    new_offsets_per_sentence.append((offset[0]-minus,offset[1]-minus))
+            new_offsets.append(new_offsets_per_sentence)
+        return sents,new_offsets
+
+    def Files_Extract(self):
+        self.event_files = {}
+        self.source_files = {}
+        self.amp_files = []
+        for dir in self.dirs:
+            path = constant.ACE_FILES+'/'+dir+'/timex2norm'
+            files = os.listdir(path)
+            self.event_files[dir] = [file for file in files if file.endswith('.apf.xml')]
+            self.source_files[dir] = [file for file in files if file.endswith('.sgm')]
+            for file in self.source_files[dir]:
+                with open(path+'/'+file,'r') as f:
+                    text = f.read()
+                if '&' in text:
+                    # remember files whose source contains '&': their token
+                    # offsets need the '&amp;' correction in Event_Extract
+                    self.amp_files.append(file[:-3])
+
+        srclen = 0
+        evtlen = 0
+        for dir in self.dirs:
+            srclen += len(self.source_files[dir])
+            evtlen += len(self.event_files[dir])
+        assert evtlen==srclen
+        assert evtlen==599  # the ACE 2005 corpus has 599 documents
+
+    def Entity_Extract(self):
+        for dir in self.dirs:
+            path = constant.ACE_FILES+'/'+dir+'/timex2norm'
+            files = self.event_files[dir]
+            for file in files:
+                DOMtree = parse(path + "/" + file)
+                collection = DOMtree.documentElement
+                mention_tags = ['entity_mention','value_mention','timex2_mention']
+                for mention_tag in mention_tags:
+                    mention = collection.getElementsByTagName(mention_tag)
+                    for sample in mention:
+                        start = int(sample.getElementsByTagName("extent")[0].getElementsByTagName("charseq")[0].getAttribute("START"))
+                        end = int(sample.getElementsByTagName("extent")[0].getElementsByTagName("charseq")[0].getAttribute("END"))
+                        name = str(sample.getElementsByTagName("extent")[0].getElementsByTagName("charseq")[0].childNodes[0].data)
+                        entity_info = (name,start,end,file,dir)
+                        self.Entities.append(entity_info)
+        self.Entities = list(set(self.Entities))
+        self.Entities = [{'name':e[0],'start':e[1],'end':e[2],'file':e[3],'dir':e[4],'role':'None'} for e in self.Entities]
+
+    def Event_Extract(self):
+        nlp = StanfordCoreNLPv2(constant.corenlp_path)
+        offsets2idx = {}
+        for dir in self.dirs:
+            path = constant.ACE_FILES+'/'+dir+'/timex2norm'
+            files = self.event_files[dir]
+            for file in files:
+                DOMtree = parse(path + "/" + file)
+                collection = DOMtree.documentElement
+                events = collection.getElementsByTagName("event")
+                Entities = [e for e in self.Entities if e['dir']==dir and e['file']==file]
+                for event in events:
+                    event_type = str(event.getAttribute("SUBTYPE"))
+                    event_mentions = event.getElementsByTagName("event_mention")
+                    for event_mention in event_mentions:
+                        event_info = event_mention.getElementsByTagName("ldc_scope")[0].getElementsByTagName("charseq")[0]
+                        sent = str(event_info.childNodes[0].data)
+                        start = int(event_info.getAttribute("START"))
+                        end = int(event_info.getAttribute("END"))
+
+                        trigger_info = event_mention.getElementsByTagName("anchor")[0].getElementsByTagName("charseq")[0]
+                        trigger = str(trigger_info.childNodes[0].data)
+                        trigger_start = int(trigger_info.getAttribute("START"))
+                        trigger_end = int(trigger_info.getAttribute("END"))
+
+                        entities = [copy.deepcopy(e) for e in Entities if e['start']>=start and e['end']<=end]
+
+                        map_entity = {(e['start'],e['end']):i for i,e in enumerate(entities)}
+
+                        arguments = event_mention.getElementsByTagName("event_mention_argument")
+                        for argument in arguments:
+                            role = str(argument.getAttribute("ROLE"))
+                            argument_info = argument.getElementsByTagName("extent")[0].getElementsByTagName("charseq")[0]
+                            argument_name = str(argument_info.childNodes[0].data)
+                            argument_start = int(argument_info.getAttribute("START"))
+                            argument_end = int(argument_info.getAttribute("END"))
+                            assert (argument_start,argument_end) in map_entity
+                            entity_id = map_entity[(argument_start,argument_end)]
+                            assert argument_name==entities[entity_id]['name']
+                            entities[entity_id]['role'] = role
+                        tokens,offsets = nlp.word_tokenize(sent,True)
+
+                        # in source files containing '&', each '&' was written as
+                        # '&amp;' (4 extra characters), so shift the token offsets
+                        # to realign with the ACE character offsets
+                        plus = 0
+                        for j,token in enumerate(tokens):
+                            st = offsets[j][0] + plus
+                            if file[:-7] in self.amp_files:
+                                plus += 4*token.count('&')
+                            ed = offsets[j][1] + plus
+                            offsets[j] = (st,ed)
+
+                        tokens_offsets = [(e[0] + start, e[1] - 1 + start) for e in offsets]
+                        find_offsets = [(e[0]+start,e[1]+start) for e in offsets]
+                        trigger_s,trigger_e = self.find_index(find_offsets,(trigger_start,trigger_end))
+                        trigger_offsets = tokens_offsets[trigger_s:trigger_e+1]
+                        trigger_tokens = tokens[trigger_s:trigger_e+1]
+                        _entities = []
+                        for e in entities:
+                            idx_start,idx_end = self.find_index(find_offsets,(e['start'],e['end']))
+                            entity_tokens = tokens[idx_start:idx_end+1]
+                            entity_offsets = tokens_offsets[idx_start:idx_end+1]
+                            entity_start = entity_offsets[0][0]
+                            entity_end = entity_offsets[-1][1]
+                            entity_info = {'tokens':entity_tokens,
+                                           'offsets':entity_offsets,
+                                           'start':entity_start,
+                                           'end':entity_end,
+                                           'idx_start':idx_start,
+                                           'idx_end':idx_end,
+                                           'role':e['role']}
+                            _entities.append(entity_info)
+                        event_summary = {'tokens': tokens,
+                                         'offsets': tokens_offsets,
+                                         'event_type': [event_type],
+                                         'start': start,
+                                         'end': end,
+                                         'trigger_tokens': [trigger_tokens],
+                                         'trigger_start': [trigger_s],
+                                         'trigger_end': [trigger_e],
+                                         'trigger_offsets': [trigger_offsets],
+                                         'entities': [_entities],
+                                         'file': file[:-8],
+                                         'dir': dir}
+                        offsets_join = str(event_summary['start'])+'_'+str(event_summary['end'])+"_"+event_summary['file']+"_"+event_summary['dir']
+
+                        # merge event mentions that share the same sentence span
+                        if offsets_join in offsets2idx:
+                            event_idx = offsets2idx[offsets_join]
+                            self.Events[event_idx]['event_type'].extend(event_summary['event_type'])
+                            self.Events[event_idx]['trigger_tokens'].extend(event_summary['trigger_tokens'])
+                            self.Events[event_idx]['trigger_start'].extend(event_summary['trigger_start'])
+                            self.Events[event_idx]['trigger_end'].extend(event_summary['trigger_end'])
+                            self.Events[event_idx]['trigger_offsets'].extend(event_summary['trigger_offsets'])
+                            self.Events[event_idx]['entities'].extend(event_summary['entities'])
+                        else:
+                            offsets2idx[offsets_join] = len(self.Events)
+                            self.Events.append(event_summary)
+        nlp.close()
+
+    def None_event_Extract(self):
+        nlp = StanfordCoreNLPv2(constant.corenlp_path)
+        for dir in self.dirs:
+            path = constant.ACE_FILES+'/'+dir+'/timex2norm'
+            files = self.source_files[dir]
+            for file in files:
+                event_in_this_file = [(e['start'],e['end']) for e in self.Events if e['file']==file[:-4] and e['dir']==dir]
+                Entities = [(e['start'],e['end']) for e in self.Entities if e['dir']==dir and e['file'][:-7]==file[:-3]]
+                with open(path+'/'+file,'r') as f:
+                    text = f.read()
+                sents,offsets = nlp.sent_tokenize(text)
+                sents,offsets = self.correct_offsets(sents,offsets)
+                sents,offsets = self.sentence_distillation(sents,offsets,dir)
+
+                # keep only sentences that do not overlap any annotated event scope
+                new_sents = []
+                new_offsets = []
+                for j,sent in enumerate(sents):
+                    offset = offsets[j]
+                    select = True
+                    for event in event_in_this_file:
+                        if (offset[0][0]>=event[0] and offset[0][0]<=event[1]) or \
+                           (offset[-1][1]-1>=event[0] and offset[-1][1]-1<=event[1]):
+                            select = False
+                            break
+                    if select:
+                        new_sents.append(sent)
+                        new_offsets.append(offset)
+
+                sents = new_sents
+                offsets = new_offsets
+
+                for i,sent in enumerate(sents):
+                    offset = offsets[i]
+                    tokens = sent
+                    start = offset[0][0]
+                    end = offset[-1][1]-1
+                    tokens_offset = [(e[0],e[1]-1) for e in offset]
+                    event_type = 'None'
+                    trigger_tokens = []
+                    trigger_offsets = []
+                    trigger_start = -1
+                    trigger_end = -1
+                    entities = []
+
+                    _entities = [e for e in Entities if e[0]>=start and e[1]<=end]
+                    for e in _entities:
+                        idx_start,idx_end = self.find_index(offset,e)
+                        entity_info = {'tokens':sent[idx_start:idx_end+1],
+                                       'role':'None',
+                                       'offsets':[(o[0],o[1]-1) for o in offset[idx_start:idx_end+1]],
+                                       'start':offset[idx_start][0],
+                                       'end':offset[idx_end][1]-1,
+                                       'idx_start':idx_start,
+                                       'idx_end':idx_end}
+                        entities.append(entity_info)
+                    none_event_summary = {
+                        'tokens':tokens,
+                        'start':start,
+                        'end':end,
+                        'offsets':tokens_offset,
+                        'event_type':event_type,
+                        'trigger_tokens':trigger_tokens,
+                        'trigger_start':trigger_start,
+                        'trigger_end':trigger_end,
+                        'trigger_offsets':trigger_offsets,
+                        'entities':entities,
+                        'file':file[:-4],
+                        'dir':dir
+                    }
+                    self.None_events.append(none_event_summary)
+        nlp.close()
+
+    def process(self):
+        # flatten merged multi-trigger sentences into one instance per trigger,
+        # and emit a 'None' candidate for every other token in the sentence
+        Events = []
+        for event in self.Events:
+            for i in range(len(event['trigger_start'])):
+                _event = {
+                    'tokens':event['tokens'],
+                    'start':event['start'],
+                    'end':event['end'],
+                    'offsets':event['offsets'],
+                    'event_type':event['event_type'][i],
+                    'trigger_tokens':event['trigger_tokens'][i],
+                    'trigger_start':event['trigger_start'][i],
+                    'trigger_end':event['trigger_end'][i],
+                    'trigger_offsets':event['trigger_offsets'][i],
+                    'entities':event['entities'][i],
+                    'file':event['file'],
+                    'dir':event['dir']
+                }
+                Events.append(_event)
+
+            _entities = []
+            for entity in event['entities'][0]:
+                add_entity = copy.deepcopy(entity)
+                add_entity['role'] = 'None'
+                _entities.append(add_entity)
+            for i in range(len(event['tokens'])):
+                if i in event['trigger_start']:
+                    continue
+                _event = {
+                    'tokens':event['tokens'],
+                    'start':event['start'],
+                    'end':event['end'],
+                    'offsets':event['offsets'],
+                    'event_type':'None',
+                    'trigger_tokens':[event['tokens'][i]],
+                    'trigger_start':i,
+                    'trigger_end':i,
+                    'trigger_offsets':[event['offsets'][i]],
+                    'entities':_entities,
+                    'file':event['file'],
+                    'dir':event['dir']
+                }
+                Events.append(_event)
+        self.Events = Events
+
+        None_events = []
+        for none_event in self.None_events:
+            for i in range(len(none_event['tokens'])):
+                _none_event = {
+                    'tokens':none_event['tokens'],
+                    'start':none_event['start'],
+                    'end':none_event['end'],
+                    'offsets':none_event['offsets'],
+                    'event_type':'None',
+                    'trigger_tokens':[none_event['tokens'][i]],
+                    'trigger_start':i,
+                    'trigger_end':i,
+                    'trigger_offsets':[none_event['offsets'][i]],
+                    'entities':none_event['entities'],
+                    'file':none_event['file'],
+                    'dir':none_event['dir']
+                }
+                None_events.append(_none_event)
+        self.None_events = None_events
+
+    def Extract(self):
+        if os.path.exists(constant.ACE_DUMP+'/train.json'):
+            print('--Files Already Exist--')
+            return
+
+        self.Files_Extract()
+        print('--File Extraction Finished--')
+        self.Entity_Extract()
+        print('--Entity Extraction Finished--')
+        self.Event_Extract()
+        print('--Event Mention Extraction Finished--')
+        self.None_event_Extract()
+        print('--Negative Mention Extraction Finished--')
+        self.process()
+        print('--Data Preprocessing Finished--')
+
+        # Random split
+        # nw = self.source_files['nw']
+        # random.shuffle(nw)
+        # random.shuffle(nw)
+        # other_files = [file for dir in self.dirs for file in self.source_files[dir] if dir!='nw']+nw[40:]
+        # random.shuffle(other_files)
+        # random.shuffle(other_files)
+
+        # test_files = nw[:40]
+        # dev_files = other_files[:30]
+        # train_files = other_files[30:]
+
+        # test_set = [instance for instance in self.Events if instance['file']+'.sgm' in test_files]+[instance for instance in self.None_events if instance['file']+".sgm" in test_files]
+        # dev_set = [instance for instance in self.Events if instance['file']+'.sgm' in dev_files]+[instance for instance in self.None_events if instance['file']+".sgm" in dev_files]
+        # train_set = [instance for instance in self.Events if instance['file']+'.sgm' in train_files]+[instance for instance in self.None_events if instance['file']+".sgm" in train_files]
+
+        # Use the fixed split
+        with open('./logs/split.json','r') as f:
+            splits = json.load(f)
+        test_files = splits['test']
+        dev_files = splits['dev']
+        train_files = splits['train']
+        test_set = [instance for instance in self.Events if instance['file'] in test_files]+[instance for instance in self.None_events if instance['file'] in test_files]
+        dev_set = [instance for instance in self.Events if instance['file'] in dev_files]+[instance for instance in self.None_events if instance['file'] in dev_files]
+        train_set = [instance for instance in self.Events if instance['file'] in train_files]+[instance for instance in self.None_events if instance['file'] in train_files]
+
+        with open(constant.ACE_DUMP+'/train.json','w') as f:
+            json.dump(train_set,f)
+        with open(constant.ACE_DUMP+'/dev.json','w') as f:
+            json.dump(dev_set,f)
+        with open(constant.ACE_DUMP+'/test.json','w') as f:
+            json.dump(test_set,f)
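+
+# Shape of each instance that Extract() writes to train/dev/test.json, as
+# flattened by process() ('event_type' is 'None' for negative candidates):
+# {'tokens': [...], 'start': int, 'end': int, 'offsets': [(s, e), ...],
+#  'event_type': str, 'trigger_tokens': [...], 'trigger_start': int,
+#  'trigger_end': int, 'trigger_offsets': [...],
+#  'entities': [{'tokens', 'offsets', 'start', 'end',
+#                'idx_start', 'idx_end', 'role'}, ...],
+#  'file': str, 'dir': str}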
+
+class Loader():
+    def __init__(self):
+        self.train_path = constant.ACE_DUMP+'/train.json'
+        self.dev_path = constant.ACE_DUMP+'/dev.json'
+        self.test_path = constant.ACE_DUMP+'/test.json'
+        self.glove_path = constant.GloVe_file
+
+    def load_embedding(self):
+        # word indices start at 2: index 0 is padding, index 1 is OOV (see get_word)
+        word2idx = {}
+        wordemb = []
+        with open(self.glove_path,'r',encoding='utf-8') as f:
+            for line in f:
+                splt = line.split()
+                assert len(splt)==constant.embedding_dim+1
+                vector = list(map(float, splt[-constant.embedding_dim:]))
+                word = splt[0]
+                word2idx[word] = len(word2idx)+2
+                wordemb.append(vector)
+        return word2idx,np.asarray(wordemb,np.float32)
+
+    def get_maxlen(self):
+        paths = [self.train_path,self.dev_path,self.test_path]
+        maxlens = []
+        for path in paths:
+            with open(path,'r') as f:
+                data = json.load(f)
+            _maxlen = max([len(d['tokens']) for d in data])
+            maxlens.append(_maxlen)
+        self.maxlen = max(maxlens)
+        return self.maxlen
+
+    def get_max_argument_len(self):
+        paths = [self.train_path,self.dev_path,self.test_path]
+        maxlens = []
+        for path in paths:
+            with open(path,'r') as f:
+                data = json.load(f)
+            for instance in data:
+                if len(instance['entities'])==0:
+                    continue
+                _maxlen = max([entity['idx_end']+1-entity['idx_start'] for entity in instance['entities']])
+                maxlens.append(_maxlen)
+        self.max_argument_len = max(maxlens)
+        return self.max_argument_len
+
+    def get_positions(self,start_idx,sent_len,maxlen):
+        # relative position of every token w.r.t. start_idx, shifted by maxlen
+        # so all real positions are positive; 0 is reserved for padding
+        return list(range(maxlen-start_idx, maxlen)) + [maxlen] + \
+               list(range(maxlen+1, maxlen+sent_len-start_idx)) + [0]*(maxlen-sent_len)
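+    # Example (hypothetical values): get_positions(2, 5, 10) returns
+    # [8, 9, 10, 11, 12, 0, 0, 0, 0, 0] -- tokens before the anchor get
+    # maxlen-distance, the anchor itself gets maxlen, tokens after it get
+    # maxlen+distance, and trailing padding positions get 0.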
+
+    def get_word(self,tokens,word2idx,pad_length):
+        idx = []
+        for word in tokens:
+            if word.lower() in word2idx:
+                idx.append(word2idx[word.lower()])
+            else:
+                idx.append(1)  # OOV
+        idx += [0]*(pad_length-len(idx))  # right-pad with 0
+        return idx
+
+    def get_trigger_mask(self,posi,sent_len,maxlen,direction):
+        assert direction in ['left','right']
+        mask = [0.]*maxlen
+        if direction=='left':
+            mask[:posi] = [1.]*posi
+        else:
+            mask[posi:sent_len] = [1.]*(sent_len-posi)
+        return mask
+
+    def get_argument_mask(self,posi1,posi2,sent_len,maxlen,direction):
+        assert direction in ['left','right','mid']
+        mask = [0.]*maxlen
+        posi_min = min(posi1,posi2)
+        posi_max = max(posi1,posi2)
+        if direction=='left':
+            mask[:posi_min] = [1.]*posi_min
+        elif direction=='mid':
+            mask[posi_min:posi_max] = [1.]*(posi_max-posi_min)
+        else:
+            mask[posi_max:sent_len] = [1.]*(sent_len-posi_max)
+        return mask
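+    # Examples (hypothetical values):
+    #   get_trigger_mask(3, 5, 8, 'left')    -> [1., 1., 1., 0., 0., 0., 0., 0.]
+    #   get_trigger_mask(3, 5, 8, 'right')   -> [0., 0., 0., 1., 1., 0., 0., 0.]
+    #   get_argument_mask(1, 3, 5, 8, 'mid') -> [0., 1., 1., 0., 0., 0., 0., 0.]
+    # i.e. the sentence is carved into segments around the trigger/argument
+    # positions (dynamic multi-pooling style), with padding always masked out.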
+
+    def load_one_trigger(self,path,maxlen,word2idx):
+        trigger_posis,sents,trigger_maskls,trigger_maskrs,event_types,trigger_lexical = [], [], [], [], [], []
+        with open(path,'r') as f:
+            data = json.load(f)
+        for instance in data:
+            tokens = instance['tokens']
+            event_type = instance['event_type']
+            trigger_posi = instance['trigger_start']
+            words = self.get_word(tokens,word2idx,maxlen)
+
+            trigger_posis.append(self.get_positions(trigger_posi,len(tokens),maxlen))
+            sents.append(words)
+            trigger_maskls.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'left'))
+            trigger_maskrs.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'right'))
+            event_types.append(constant.EVENT_TYPE_TO_ID[event_type])
+
+            # lexical context: [previous word, trigger word, next word],
+            # padded with 0 at sentence boundaries
+            _trigger_lexical = []
+            if trigger_posi==0:
+                _trigger_lexical.append(0)
+            else:
+                _trigger_lexical.append(words[trigger_posi-1])
+
+            _trigger_lexical.append(words[trigger_posi])
+
+            if trigger_posi==len(tokens)-1:
+                _trigger_lexical.append(0)
+            else:
+                _trigger_lexical.append(words[trigger_posi+1])
+
+            trigger_lexical.append(_trigger_lexical)
+
+        return np.array(trigger_posis,np.int32),np.array(sents,np.int32),np.array(trigger_maskls,np.int32),\
+               np.array(trigger_maskrs,np.int32),np.array(event_types,np.int32),np.array(trigger_lexical,np.int32)
+
+    def load_trigger(self):
+        print('--Loading Trigger--')
+        word2idx,self.wordemb = self.load_embedding()
+        maxlen = self.get_maxlen()
+        paths = [self.train_path, self.dev_path, self.test_path]
+        results = []
+        for path in paths:
+            result = self.load_one_trigger(path,maxlen,word2idx)
+            results.append(result)
+        return results
+
+    def load_one_argument(self,path,maxlen,word2idx,max_argument_len,dataset='train'):
+        sents, event_types, roles, maskl, maskm, maskr, trigger_lexical, argument_lexical,\
+            trigger_maskl, trigger_maskr, trigger_posis, argument_posis = [],[],[],[],[],[],[],[],[],[],[],[]
+
+        with open(path,'r') as f:
+            data = json.load(f)
+        for instance in data:
+            # only annotated events are used for training; dev/test keep the
+            # 'None' candidates so evaluation also sees negative instances
+            if dataset=='train':
+                if instance['event_type']=='None':
+                    continue
+            if len(instance['entities'])==0:
+                continue
+            tokens = instance['tokens']
+            words = self.get_word(tokens,word2idx,maxlen)
+            trigger_posi = instance['trigger_start']
+            event_type = instance['event_type']
+
+            _trigger_lexical = []
+            if trigger_posi==0:
+                _trigger_lexical.append(0)
+            else:
+                _trigger_lexical.append(words[trigger_posi-1])
+
+            _trigger_lexical.append(words[trigger_posi])
+
+            if trigger_posi==len(tokens)-1:
+                _trigger_lexical.append(0)
+            else:
+                _trigger_lexical.append(words[trigger_posi+1])
+
+            for entity in instance['entities']:
+                role = entity['role']
+                entity_start = entity['idx_start']
+                entity_end = entity['idx_end']
+
+                sents.append(words)
+                event_types.append(constant.EVENT_TYPE_TO_ID[event_type])
+                roles.append(constant.ROLE_TO_ID[role])
+                trigger_posis.append(self.get_positions(trigger_posi,len(tokens),maxlen))
+                argument_posis.append(self.get_positions(entity_start,len(tokens),maxlen))
+                maskl.append(self.get_argument_mask(entity_start,trigger_posi,len(tokens),maxlen,'left'))
+                maskm.append(self.get_argument_mask(entity_start,trigger_posi,len(tokens),maxlen,'mid'))
+                maskr.append(self.get_argument_mask(entity_start,trigger_posi,len(tokens),maxlen,'right'))
+                trigger_lexical.append(_trigger_lexical)
+
+                trigger_maskl.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'left'))
+                trigger_maskr.append(self.get_trigger_mask(trigger_posi,len(tokens),maxlen,'right'))
+
+                # argument context: previous word, argument tokens padded to
+                # max_argument_len, next word
+                _argument_lexical = []
+                if entity_start==0:
+                    _argument_lexical.append(0)
+                else:
+                    _argument_lexical.append(words[entity_start-1])
+
+                _argument_lexical.extend(words[entity_start:entity_end+1]+[0]*(max_argument_len-entity_end-1+entity_start))
+
+                if entity_end==len(tokens)-1:
+                    _argument_lexical.append(0)
+                else:
+                    _argument_lexical.append(words[entity_end+1])
+
+                argument_lexical.append(_argument_lexical)
+        return np.array(sents,np.int32),np.array(event_types,np.int32),np.array(roles,np.int32),\
+               np.array(maskl,np.int32),np.array(maskm,np.int32),np.array(maskr,np.int32),\
+               np.array(trigger_lexical,np.int32),np.array(argument_lexical,np.int32),\
+               np.array(trigger_maskl,np.int32),np.array(trigger_maskr,np.int32),\
+               np.array(trigger_posis,np.int32),np.array(argument_posis,np.int32)
+
+    def load_argument(self):
+        print('--Loading Argument--')
+        word2idx,self.wordemb = self.load_embedding()
+        maxlen = self.get_maxlen()
+        max_argument_len = self.get_max_argument_len()
+        results = []
+        for path,dataset in [(self.train_path,'train'),(self.dev_path,'dev'),(self.test_path,'test')]:
+            result = self.load_one_argument(path,maxlen,word2idx,max_argument_len,dataset)
+            results.append(result)
+        return results
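+
+# A minimal driver sketch (an assumption about how these classes are meant to
+# be called -- the repository's training script may wire them up differently;
+# it relies on constant.py providing ACE_FILES, ACE_DUMP, corenlp_path,
+# GloVe_file, embedding_dim, EVENT_TYPE_TO_ID and ROLE_TO_ID):
+if __name__ == '__main__':
+    extractor = Extractor()
+    extractor.Extract()  # writes train/dev/test.json under constant.ACE_DUMP
+    loader = Loader()
+    trigger_splits = loader.load_trigger()    # [(train...), (dev...), (test...)]
+    argument_splits = loader.load_argument()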