DeepIE/layers/decoders/crf.py
2020-05-20 20:45:09 +08:00

275 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
# @Author: Jie Yang
# @Date: 2017-12-04 23:19:38
# @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com
# @Last Modified time: 2018-05-27 22:48:17
import torch
import torch.autograd as autograd
import torch.nn as nn
START_TAG = -2
STOP_TAG = -1
# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec, m_size):
"""
calculate log of exp sum
args:
vec (batch_size, vanishing_dim, hidden_dim) : input tensor (b,t,t)
m_size : hidden_dim
return:
batch_size, hidden_dim
"""
_, idx = torch.max(vec, 1) # B * 1 * M
max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M
return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1,
m_size)
class CRF(nn.Module):
def __init__(self, tagset_size, gpu, device=0):
super(CRF, self).__init__()
print("build batched crf...")
self.gpu = gpu
self.device = device
# Matrix of transition parameters. Entry i,j is the score of transitioning *to* i *from* j.
self.average_batch = False
self.tagset_size = tagset_size
# # We add 2 here, because of START_TAG and STOP_TAG
# # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag
init_transitions = torch.zeros(self.tagset_size + 2, self.tagset_size + 2)
# init_transitions = torch.zeros(self.tagset_size+2, self.tagset_size+2)
# init_transitions[:,START_TAG] = -1000.0
# init_transitions[STOP_TAG,:] = -1000.0
# init_transitions[:,0] = -1000.0
# init_transitions[0,:] = -1000.0
if self.gpu:
init_transitions = init_transitions.cuda(device)
self.transitions = nn.Parameter(init_transitions) # (t+2,t+2)
# self.transitions = nn.Parameter(torch.Tensor(self.tagset_size+2, self.tagset_size+2))
# self.transitions.data.zero_()
def _calculate_PZ(self, feats, mask):
"""
input:
feats: (batch, seq_len, self.tag_size+2) (b,m,t+2)
masks: (batch, seq_len) (b,m)
"""
batch_size = feats.size(0)
seq_len = feats.size(1)
tag_size = feats.size(2)
# print feats.view(seq_len, tag_size)
assert (tag_size == self.tagset_size + 2)
mask = mask.transpose(1, 0).contiguous() # (m,b)
ins_num = seq_len * batch_size
## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1)
feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size)
## need to consider start
scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size)
scores = scores.view(seq_len, batch_size, tag_size, tag_size)
# build iter
seq_iter = enumerate(scores) # (index,matrix) index is among the first dim: seqlen
_, inivalues = seq_iter.__next__()
# only need start from start_tag
partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1)
## add start score (from start to all tag, duplicate to batch_size)
# partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1)
# iter over last scores
for idx, cur_values in seq_iter:
# previous to_target is current from_target
# partition: previous results log(exp(from_target)), #(batch_size * from_target)
# cur_values: bat_size * from_target * to_target
cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size,
tag_size)
cur_partition = log_sum_exp(cur_values, tag_size) # (b,t)
# print cur_partition.data
# (bat_size * from_target * to_target) -> (bat_size * to_target)
# partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1)
mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size)
## effective updated partition part, only keep the partition value of mask value = 1
masked_cur_partition = cur_partition.masked_select(mask_idx)
## let mask_idx broadcastable, to disable warning
mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1)
## replace the partition where the maskvalue=1, other partition value keeps the same
partition.masked_scatter_(mask_idx, masked_cur_partition)
# until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG
cur_values = self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size,
tag_size) + partition.contiguous().view(
batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size)
cur_partition = log_sum_exp(cur_values, tag_size) # (batch_size,hidden_dim)
final_partition = cur_partition[:, STOP_TAG] # (batch_size)
return final_partition.sum(), scores # scores: (seq_len, batch, tag_size, tag_size)
def _viterbi_decode(self, feats, mask):
"""
input:
feats: (batch, seq_len, self.tag_size+2)
mask: (batch, seq_len)
output:
decode_idx: (batch, seq_len) decoded sequence
path_score: (batch, 1) corresponding score for each sequence (to be implementated)
"""
batch_size = feats.size(0)
seq_len = feats.size(1)
tag_size = feats.size(2)
assert (tag_size == self.tagset_size + 2)
## calculate sentence length for each sentence
length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
## mask to (seq_len, batch_size)
mask = mask.transpose(1, 0).contiguous() # seq_len,b
ins_num = seq_len * batch_size
## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1)
feats = feats.transpose(1, 0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size,
tag_size) # (ins_num, tag_size, tag_size)
## need to consider start
scores = feats + self.transitions.view(1, tag_size, tag_size).expand(ins_num, tag_size, tag_size)
scores = scores.view(seq_len, batch_size, tag_size, tag_size)
# build iter
seq_iter = enumerate(scores)
## record the position of best score
back_points = list()
partition_history = list()
## reverse mask (bug for mask = 1- mask, use this as alternative choice)
# mask = 1 + (-1)*mask
mask = (1 - mask.long()).byte()
_, inivalues = seq_iter.__next__() # bat_size * from_target_size * to_target_size
# only need start from start_tag
partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size
# print(partition.size())
partition_history.append(partition) # (seqlen,batch_size,tag_size,1)
# iter over last scores
for idx, cur_values in seq_iter:
# previous to_target is current from_target
# partition: previous results log(exp(from_target)), #(batch_size * from_target)
# cur_values: batch_size * from_target * to_target
cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size,
tag_size)
## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG
partition, cur_bp = torch.max(cur_values, dim=1)
# print(partition.size())
partition_history.append(partition.unsqueeze(2))
## cur_bp: (batch_size, tag_size) max source score position in current tag
## set padded label as 0, which will be filtered in post processing
cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0)
back_points.append(cur_bp)
### add score to final STOP_TAG
partition_history = torch.cat(partition_history, dim=0).view(seq_len, batch_size, -1).transpose(1,
0).contiguous() ## (batch_size, seq_len, tag_size)
### get the last position for each setences, and select the last partitions using gather()
last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1
last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, 1)
### calculate the score from last partition to end state (and then select the STOP_TAG from it)
last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1, tag_size,
tag_size).expand(
batch_size, tag_size, tag_size)
_, last_bp = torch.max(last_values, 1) # (batch_size,tag_size)
pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long()
if self.gpu:
pad_zero = pad_zero.cuda(self.device)
back_points.append(pad_zero)
back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size)
## select end ids in STOP_TAG
pointer = last_bp[:, STOP_TAG] # (batch_size)
insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size)
back_points = back_points.transpose(1, 0).contiguous() # (batch_size,sq_len,tag_size)
## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values
# print "lp:",last_position
# print "il:",insert_last
back_points.scatter_(1, last_position, insert_last) ##(batch_size,sq_len,tag_size)
# print "bp:",back_points
# exit(0)
back_points = back_points.transpose(1, 0).contiguous() # (seq_len, batch_size, tag_size)
## decode from the end, padded position ids are 0, which will be filtered if following evaluation
decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size))
if self.gpu:
decode_idx = decode_idx.cuda(self.device)
decode_idx[-1] = pointer.data
for idx in range(len(back_points) - 2, -1, -1):
pointer = torch.gather(back_points[idx], 1,
pointer.contiguous().view(batch_size, 1)) # pointer's size:(batch_size,1)
decode_idx[idx] = pointer.squeeze(1).data
path_score = None
decode_idx = decode_idx.transpose(1, 0) # (batch_size, sent_len)
return path_score, decode_idx #
def forward(self, feats):
path_score, best_path = self._viterbi_decode(feats)
return path_score, best_path
def _score_sentence(self, scores, mask, tags):
"""
input:
scores: variable (seq_len, batch, tag_size, tag_size)
mask: (batch, seq_len)
tags: tensor (batch, seq_len)
output:
score: sum of score for gold sequences within whole batch
"""
# Gives the score of a provided tag sequence
batch_size = scores.size(1)
seq_len = scores.size(0)
tag_size = scores.size(2)
## convert tag value into a new format, recorded label bigram information to index
new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len))
if self.gpu:
new_tags = new_tags.cuda(self.device)
for idx in range(seq_len):
if idx == 0:
## start -> first score
new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0]
else:
new_tags[:, idx] = tags[:, idx - 1] * tag_size + tags[:, idx]
## transition for label to STOP_TAG
end_transition = self.transitions[:, STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size)
## length for batch, last word position = length - 1
length_mask = torch.sum(mask.long(), dim=1).view(batch_size, 1).long()
## index the label id of last word
end_ids = torch.gather(tags, 1, length_mask - 1)
## index the transition score for end_id to STOP_TAG
end_energy = torch.gather(end_transition, 1, end_ids)
## convert tag as (seq_len, batch_size, 1)
new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1)
### need convert tags id to search from 400 positions of scores
tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len,
batch_size) # seq_len * bat_size
## mask transpose to (seq_len, batch_size)
tg_energy = tg_energy.masked_select(mask.transpose(1, 0))
# ## calculate the score from START_TAG to first label
# start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size)
# start_energy = torch.gather(start_transition, 1, tags[0,:])
## add all score together
# gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum()
gold_score = tg_energy.sum() + end_energy.sum()
return gold_score
def neg_log_likelihood_loss(self, feats, mask, tags):
# nonegative log likelihood
batch_size = feats.size(0)
forward_score, scores = self._calculate_PZ(feats,
mask) # forward_score:long, scores: (seq_len, batch, tag_size, tag_size)
gold_score = self._score_sentence(scores, mask, tags)
# print ("batch, f:", forward_score.data, " g:", gold_score.data, " dis:", forward_score.data - gold_score.data)
# exit(0)
if self.average_batch:
return (forward_score - gold_score) / batch_size
else:
return forward_score - gold_score