ner of bert+bilstm+crf
This commit is contained in:
parent
ff8cd5ab0b
commit
d7518a3c92
5
Ner/__init__.py
Normal file
@@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/21 15:23
# @author :Mo
# @function :
5
Ner/bert/__init__.py
Normal file
@@ -0,0 +1,5 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/21 15:23
# @author :Mo
# @function :
33
Ner/bert/args.py
Normal file
@@ -0,0 +1,33 @@
# bi-lstm
return_sequences = True
use_cudnn_cell = True
use_lstm = True
use_crf = True
is_training = True

loss = 'categorical_crossentropy'
metrics = ['accuracy']  # 'crf_loss' # ['accuracy']
activation = 'relu'  # 'relu'
optimizers = 'adam'
learning_rate = 1e-3
epsilon = 1e-9
embedding_dim = 768
keep_prob = 0.5
units = 256
decay = 0.0
label = 7
l2 = 0.032

epochs = 320
batch_size = 16
path_save_model = 'models/bilstm/bert_ner_bilstm_no_12_config.h5'
path_tag_li = 'models/bilstm/tag_l_i.pkl'

# GPU memory usage fraction
gpu_memory_fraction = 0.32

# for NER all layers are extracted; for sentence vectors, the output of the second-to-last layer is taken by default
layer_indexes = [i for i in range(13)]  # [-2]

# maximum sequence length
max_seq_len = 50
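These hyperparameters live as module-level names, so downstream code imports the module itself and reads them as attributes (as `keras_bert_ner_bi_lstm.py` below does with `from Ner.bert import args`). A minimal sketch of that pattern; the `show_config` helper is illustrative only:

```python
# sketch: args is a plain module used as a flat configuration object
from Ner.bert import args

def show_config():
    # overriding a hyperparameter before building the model is just
    # an attribute assignment on the module
    args.batch_size = 32
    print("units=%d lr=%g use_crf=%s" % (args.units, args.learning_rate, args.use_crf))
```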
90
Ner/bert/keras_bert_embedding.py
Normal file
@@ -0,0 +1,90 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/8 20:04
# @author :Mo
# @function :embedding of bert keras

import os

import keras.backend.tensorflow_backend as ktf_keras
import tensorflow as tf
from Ner.bert.keras_bert_layer import NonMaskingLayer
from keras.layers import Add, Concatenate
from keras.models import Model
from keras_bert import load_trained_model_from_checkpoint

from Ner.bert.args import gpu_memory_fraction, max_seq_len, layer_indexes
from conf.feature_config import config_name, ckpt_name, vocab_file

# kept global so the model can be served from django, flask, tornado, etc.
graph = None
model = None

# GPU configuration and memory-fraction settings
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
sess = tf.Session(config=config)
ktf_keras.set_session(sess)


class KerasBertEmbedding():
    def __init__(self):
        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len

    def bert_encode(self):
        # kept global so the model can be served from django, flask, tornado, etc.
        global graph
        graph = tf.get_default_graph()
        global model
        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
                                                   seq_len=self.max_seq_len)
        bert_layers = model.layers
        # return model
        print(bert_layers)
        print(model.output)
        print(len(model.layers))
        # lay = model.layers
        # 104 layers in total: the first eight cover token, position, embedding, etc.,
        # then 8 layers per transformer block (MultiHeadAttention, Dropout, Add, LayerNormalization)
        # for 12 blocks, plus the initial unprocessed layer (think of it as the input)
        layer_dict = [7]
        layer_0 = 7
        for i in range(13):
            layer_0 = layer_0 + 8
            layer_dict.append(layer_0)

        # output the model as-is
        if len(layer_indexes) == 0:
            encoder_layer = model.output
        # if only one layer is requested, take just that layer's weights; an invalid index falls back to the last layer
        elif len(layer_indexes) == 1:
            if layer_indexes[0] in [i + 1 for i in range(12)]:
                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output
            else:
                encoder_layer = model.get_layer(index=layer_dict[-1]).output
        # otherwise walk the requested layers, take each one's weights and concatenate them, shape: 768 * number of layers
        else:
            # layer_indexes must be [1,2,3,......12]
            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
            all_layers = [model.get_layer(index=layer_dict[lay]).output if lay in [i for i in range(13)]
                          else model.get_layer(index=layer_dict[-1]).output  # if the index given is invalid, default to the last layer
                          for lay in layer_indexes]
            print(layer_indexes)
            print(all_layers)
            all_layers_select = []
            for all_layers_one in all_layers:
                all_layers_select.append(all_layers_one)
            # encoder_layer = Add()(all_layers_select)
            encoder_layer = Concatenate(axis=-1)(all_layers_select)
            print(encoder_layer.shape)
        print("KerasBertEmbedding:")
        print(encoder_layer.shape)
        output = NonMaskingLayer()(encoder_layer)
        model = Model(model.inputs, output)
        # model.summary(120)
        return model.inputs, model.output


if __name__ == "__main__":
    bert_vector = KerasBertEmbedding()
    pooled = bert_vector.bert_encode()
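`graph` is captured globally because a TF1-era Keras model built at import time must be evaluated under the same default graph when it is later called from web-framework worker threads (django, flask, tornado). A minimal serving sketch under that assumption; the `embed` helper and its inputs are illustrative:

```python
# sketch: serve the shared embedding under the graph captured in bert_encode()
import numpy as np
from keras.models import Model

inputs, output = KerasBertEmbedding().bert_encode()
embed_model = Model(inputs, output)

def embed(token_ids, segment_ids):
    # keras-bert models take token ids and segment ids as their two inputs
    with graph.as_default():
        return embed_model.predict([np.array(token_ids), np.array(segment_ids)])
```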
780
Ner/bert/keras_bert_layer.py
Normal file
@@ -0,0 +1,780 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/10 10:49
# @author :Mo
# @function : 1. create model of keras-bert for get [-2] layers
#             2. create model of AttentionWeightedAverage for get avg attention pooling
#             3. create layer of CRF
#             code class NonMaskingLayer from https://github.com/jacoxu
#             code class AttentionWeightedAverage from https://github.com/BrikerMan/Kashgari
#             code class CRF most from https://github.com/keras-team/keras-contrib, a little of 'theano' from https://github.com/BrikerMan/Kashgari


from __future__ import absolute_import
from __future__ import division


from keras.engine import InputSpec
import keras.backend as k_keras
from keras.engine import Layer
from keras import initializers
from keras import backend as K
from keras import regularizers
from keras import activations
from keras import constraints
import warnings
import keras
# crf_loss
from keras.losses import sparse_categorical_crossentropy
from keras.losses import categorical_crossentropy

# this repo uses plain keras (not tf.keras); the flag is referenced by to_tuple() below
is_tf_keras = False


class NonMaskingLayer(Layer):
    """
    fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978
    thanks for https://github.com/jacoxu
    """

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(NonMaskingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        pass

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        return x

    def compute_output_shape(self, input_shape):
        return input_shape


class AttentionWeightedAverage(Layer):
    '''
    codes from: https://github.com/BrikerMan/Kashgari
    detail: https://github.com/BrikerMan/Kashgari/blob/master/kashgari/tasks/classification/models.py
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.
    '''

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_w'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = k_keras.dot(x, self.W)
        x_shape = k_keras.shape(x)
        logits = k_keras.reshape(logits, (x_shape[0], x_shape[1]))
        ai = k_keras.exp(logits - k_keras.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = k_keras.cast(mask, k_keras.floatx())
            ai = ai * mask
        att_weights = ai / (k_keras.sum(ai, axis=1, keepdims=True) + k_keras.epsilon())
        weighted_input = x * k_keras.expand_dims(att_weights)
        result = k_keras.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None


# crf_loss
def crf_nll(y_true, y_pred):
    """The negative log-likelihood for linear chain Conditional Random Field (CRF).
    This loss function is only used when the `layers.CRF` layer
    is trained in the "join" mode.
    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.
    # Returns
        A scalar corresponding to the negative log-likelihood.
    # Raises
        TypeError: If CRF is not the last layer.
    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    """

    crf, idx = y_pred._keras_history[:2]
    if crf._outbound_nodes:
        raise TypeError('When learn_mode="join", CRF must be the last layer.')
    if crf.sparse_target:
        y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units)
    X = crf._inbound_nodes[idx].input_tensors[0]
    mask = crf._inbound_nodes[idx].input_masks[0]
    nloglik = crf.get_negative_log_likelihood(y_true, X, mask)
    # newly added
    # nloglik = k_keras.abs(nloglik)
    return nloglik


def crf_loss(y_true, y_pred):
    """General CRF loss function depending on the learning mode.
    # Arguments
        y_true: tensor with true targets.
        y_pred: tensor with predicted targets.
    # Returns
        If the CRF layer is being trained in the join mode, returns the negative
        log-likelihood. Otherwise returns the categorical crossentropy implemented
        by the underlying Keras backend.
    # About GitHub
        If you open an issue or a pull request about CRF, please
        add `cc @lzfelix` to notify Luiz Felix.
    """
    crf, idx = y_pred._keras_history[:2]
    if crf.learn_mode == 'join':
        return crf_nll(y_true, y_pred)
    else:
        if crf.sparse_target:
            return sparse_categorical_crossentropy(y_true, y_pred)
        else:
            return categorical_crossentropy(y_true, y_pred)


# crf_marginal_accuracy, crf_viterbi_accuracy
def _get_accuracy(y_true, y_pred, mask, sparse_target=False):
    """
    :param y_true:
    :param y_pred:
    :param mask:
    :param sparse_target:
    :return:
    """
    y_pred = K.argmax(y_pred, -1)
    if sparse_target:
        y_true = K.cast(y_true[:, :, 0], K.dtype(y_pred))
    else:
        y_true = K.argmax(y_true, -1)
    judge = K.cast(K.equal(y_pred, y_true), K.floatx())
    if mask is None:
        return K.mean(judge)
    else:
        mask = K.cast(mask, K.floatx())
        return K.sum(judge * mask) / K.sum(mask)


def crf_viterbi_accuracy(y_true, y_pred):
    '''Use Viterbi algorithm to get best path, and compute its accuracy.
    `y_pred` must be an output from CRF.'''
    crf, idx = y_pred._keras_history[:2]
    X = crf._inbound_nodes[idx].input_tensors[0]
    mask = crf._inbound_nodes[idx].input_masks[0]
    y_pred = crf.viterbi_decoding(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)


def crf_marginal_accuracy(y_true, y_pred):
    '''Use time-wise marginal argmax as prediction.
    `y_pred` must be an output from CRF with `learn_mode="marginal"`.'''
    crf, idx = y_pred._keras_history[:2]
    X = crf._inbound_nodes[idx].input_tensors[0]
    mask = crf._inbound_nodes[idx].input_masks[0]
    y_pred = crf.get_marginal_prob(X, mask)
    return _get_accuracy(y_true, y_pred, mask, crf.sparse_target)


def crf_accuracy(y_true, y_pred):
    '''Get default accuracy based on CRF `test_mode`.'''
    crf, idx = y_pred._keras_history[:2]
    if crf.test_mode == 'viterbi':
        return crf_viterbi_accuracy(y_true, y_pred)
    else:
        return crf_marginal_accuracy(y_true, y_pred)


def to_tuple(shape):
    """This function is here to fix an inconsistency between keras and tf.keras.
    In tf.keras, the input_shape argument is a tuple with `Dimensions` objects.
    In keras, the input_shape is a simple tuple of ints or `None`.
    We'll work with tuples of ints or `None` to be consistent
    with keras-team/keras. So we must apply this function to
    all input_shapes of the build methods in custom layers.
    """
    if is_tf_keras:
        import tensorflow as tf
        return tuple(tf.TensorShape(shape).as_list())
    else:
        return shape


class CRF(Layer):
    """
    codes from: https://github.com/keras-team/keras-contrib
    detail: https://github.com/keras-team/keras-contrib/blob/fff264273d5347613574ff533c598f55f15d4763/keras_contrib/layers/crf.py

    An implementation of linear chain conditional random field (CRF).
    A linear chain CRF is defined to maximize the following likelihood function:
    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
    \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n
        - \sum_{k=1}^n ((f(x_k' W + b) y_k) + y_k' U y_{k+1})), $$
    where:
        $Z$: normalization constant
        $x_k, y_k$: inputs and outputs
    This implementation has two modes for optimization:
    1. (`join mode`) optimized by maximizing the joint likelihood,
       which is optimal in theory of statistics.
       Note that in this case, CRF must be the output/last layer.
    2. (`marginal mode`) return marginal probabilities on each time
       step and optimized via composition
       likelihood (product of marginal likelihood), i.e.,
       using `categorical_crossentropy` loss.
       Note that in this case, CRF can be either the last layer or an
       intermediate layer (though not explored).
    For prediction (test phase), one can choose either Viterbi
    best path (class indices) or marginal
    probabilities if probabilities are needed.
    However, if one chooses *join mode* for training,
    Viterbi output is typically better than marginal output,
    but the marginal output will still perform
    reasonably close, while if *marginal mode* is used for training,
    marginal output usually performs
    much better. The default behavior and `metrics.crf_accuracy`
    is set according to this observation.
    In addition, this implementation supports masking and accepts either
    onehot or sparse target.
    If you open an issue or a pull request about CRF, please
    add 'cc @lzfelix' to notify Luiz Felix.
    # Examples
    ```python
        from keras_contrib.layers import CRF
        from keras_contrib.losses import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy
        model = Sequential()
        model.add(Embedding(3001, 300, mask_zero=True))
        # use learn_mode = 'join', test_mode = 'viterbi',
        # sparse_target = True (label indice output)
        crf = CRF(10, sparse_target=True)
        model.add(crf)
        # crf_accuracy is default to Viterbi acc if using join-mode (default).
        # One can add crf.marginal_acc if interested, but may slow down learning
        model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
        # y must be label indices (with shape 1 at dim 3) here,
        # since `sparse_target=True`
        model.fit(x, y)
        # prediction gives onehot representation of Viterbi best path
        y_hat = model.predict(x_test)
    ```
    The following snippet shows how to load a persisted
    model that uses the CRF layer:
    ```python
        from keras.models import load_model
        from keras_contrib.losses import crf_loss
        from keras_contrib.metrics import crf_viterbi_accuracy
        custom_objects={'CRF': CRF,
                        'crf_loss': crf_loss,
                        'crf_viterbi_accuracy': crf_viterbi_accuracy}
        loaded_model = load_model('<path_to_model>',
                                  custom_objects=custom_objects)
    ```
    # Arguments
        units: Positive integer, dimensionality of the output space.
        learn_mode: Either 'join' or 'marginal'.
            The former trains the model by maximizing the joint likelihood while the latter
            maximizes the product of marginal likelihood over all time steps.
            One should use `losses.crf_nll` for 'join' mode
            and `losses.categorical_crossentropy` or
            `losses.sparse_categorical_crossentropy` for
            `marginal` mode. For convenience, simply
            use `losses.crf_loss`, which will decide the proper loss as described.
        test_mode: Either 'viterbi' or 'marginal'.
            The former is recommended and the default when `learn_mode = 'join'` and
            gives a one-hot representation of the best path at test (prediction) time,
            while the latter is recommended and chosen as default
            when `learn_mode = 'marginal'`,
            which produces marginal probabilities for each time step.
            For evaluating metrics, one should
            use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and
            'metrics.crf_marginal_accuracy' for 'marginal' mode, or
            simply use `metrics.crf_accuracy` for
            both, which automatically decides it as described.
            One can also use both for evaluation at training.
        sparse_target: Boolean (default False) indicating
            if provided labels are one-hot or
            indices (with shape 1 at dim 3).
        use_boundary: Boolean (default True) indicating if trainable
            start-end chain energies
            should be added to model.
        use_bias: Boolean, whether the layer uses a bias vector.
        kernel_initializer: Initializer for the `kernel` weights matrix,
            used for the linear transformation of the inputs.
            (see [initializers](../initializers.md)).
        chain_initializer: Initializer for the `chain_kernel` weights matrix,
            used for the CRF chain energy.
            (see [initializers](../initializers.md)).
        boundary_initializer: Initializer for the `left_boundary`,
            'right_boundary' weights vectors,
            used for the start/left and end/right boundary energy.
            (see [initializers](../initializers.md)).
        bias_initializer: Initializer for the bias vector
            (see [initializers](../initializers.md)).
        activation: Activation function to use
            (see [activations](../activations.md)).
            If you pass None, no activation is applied
            (ie. "linear" activation: `a(x) = x`).
        kernel_regularizer: Regularizer function applied to
            the `kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        chain_regularizer: Regularizer function applied to
            the `chain_kernel` weights matrix
            (see [regularizer](../regularizers.md)).
        boundary_regularizer: Regularizer function applied to
            the 'left_boundary', 'right_boundary' weight vectors
            (see [regularizer](../regularizers.md)).
        bias_regularizer: Regularizer function applied to the bias vector
            (see [regularizer](../regularizers.md)).
        kernel_constraint: Constraint function applied to
            the `kernel` weights matrix
            (see [constraints](../constraints.md)).
        chain_constraint: Constraint function applied to
            the `chain_kernel` weights matrix
            (see [constraints](../constraints.md)).
        boundary_constraint: Constraint function applied to
            the `left_boundary`, `right_boundary` weights vectors
            (see [constraints](../constraints.md)).
        bias_constraint: Constraint function applied to the bias vector
            (see [constraints](../constraints.md)).
        input_dim: dimensionality of the input (integer).
            This argument (or alternatively, the keyword argument `input_shape`)
            is required when using this layer as the first layer in a model.
        unroll: Boolean (default False). If True, the network will be
            unrolled, else a symbolic loop will be used.
            Unrolling can speed up an RNN, although it tends
            to be more memory-intensive.
            Unrolling is only suitable for short sequences.
    # Input shape
        3D tensor with shape `(nb_samples, timesteps, input_dim)`.
    # Output shape
        3D tensor with shape `(nb_samples, timesteps, units)`.
    # Masking
        This layer supports masking for input data with a variable number
        of timesteps. To introduce masks to your data,
        use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
        set to `True`.
    """

    def __init__(self, units,
                 learn_mode='join',
                 test_mode=None,
                 sparse_target=False,
                 use_boundary=True,
                 use_bias=True,
                 activation='linear',
                 kernel_initializer='glorot_uniform',
                 chain_initializer='orthogonal',
                 bias_initializer='zeros',
                 boundary_initializer='zeros',
                 kernel_regularizer=None,
                 chain_regularizer=None,
                 boundary_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 chain_constraint=None,
                 boundary_constraint=None,
                 bias_constraint=None,
                 input_dim=None,
                 unroll=False,
                 **kwargs):
        super(CRF, self).__init__(**kwargs)
        self.supports_masking = True
        self.units = units
        self.learn_mode = learn_mode
        assert self.learn_mode in ['join', 'marginal']
        self.test_mode = test_mode
        if self.test_mode is None:
            self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal'
        else:
            assert self.test_mode in ['viterbi', 'marginal']
        self.sparse_target = sparse_target
        self.use_boundary = use_boundary
        self.use_bias = use_bias

        self.activation = activations.get(activation)

        self.kernel_initializer = initializers.get(kernel_initializer)
        self.chain_initializer = initializers.get(chain_initializer)
        self.boundary_initializer = initializers.get(boundary_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.chain_regularizer = regularizers.get(chain_regularizer)
        self.boundary_regularizer = regularizers.get(boundary_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)

        self.kernel_constraint = constraints.get(kernel_constraint)
        self.chain_constraint = constraints.get(chain_constraint)
        self.boundary_constraint = constraints.get(boundary_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        self.unroll = unroll

    def build(self, input_shape):
        # input_shape = to_tuple(input_shape)
        self.input_spec = [InputSpec(shape=input_shape)]
        self.input_dim = input_shape[-1]

        self.kernel = self.add_weight(shape=(self.input_dim, self.units),
                                      name='kernel',
                                      initializer=self.kernel_initializer,
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        self.chain_kernel = self.add_weight(shape=(self.units, self.units),
                                            name='chain_kernel',
                                            initializer=self.chain_initializer,
                                            regularizer=self.chain_regularizer,
                                            constraint=self.chain_constraint)
        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        name='bias',
                                        initializer=self.bias_initializer,
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = 0

        if self.use_boundary:
            self.left_boundary = self.add_weight(shape=(self.units,),
                                                 name='left_boundary',
                                                 initializer=self.boundary_initializer,
                                                 regularizer=self.boundary_regularizer,
                                                 constraint=self.boundary_constraint)
            self.right_boundary = self.add_weight(shape=(self.units,),
                                                  name='right_boundary',
                                                  initializer=self.boundary_initializer,
                                                  regularizer=self.boundary_regularizer,
                                                  constraint=self.boundary_constraint)
        self.built = True

    def call(self, X, mask=None):
        if mask is not None:
            assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None'

        if self.test_mode == 'viterbi':
            test_output = self.viterbi_decoding(X, mask)
        else:
            test_output = self.get_marginal_prob(X, mask)

        self.uses_learning_phase = True
        if self.learn_mode == 'join':
            train_output = K.zeros_like(K.dot(X, self.kernel))
            out = K.in_train_phase(train_output, test_output)
        else:
            if self.test_mode == 'viterbi':
                train_output = self.get_marginal_prob(X, mask)
                out = K.in_train_phase(train_output, test_output)
            else:
                out = test_output
        return out

    def compute_output_shape(self, input_shape):
        return input_shape[:2] + (self.units,)

    def compute_mask(self, input, mask=None):
        if mask is not None and self.learn_mode == 'join':
            return K.any(mask, axis=1)
        return mask

    def get_config(self):
        config = {
            'units': self.units,
            'learn_mode': self.learn_mode,
            'test_mode': self.test_mode,
            'use_boundary': self.use_boundary,
            'use_bias': self.use_bias,
            'sparse_target': self.sparse_target,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'chain_initializer': initializers.serialize(self.chain_initializer),
            'boundary_initializer': initializers.serialize(
                self.boundary_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'activation': activations.serialize(self.activation),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'chain_regularizer': regularizers.serialize(self.chain_regularizer),
            'boundary_regularizer': regularizers.serialize(
                self.boundary_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'chain_constraint': constraints.serialize(self.chain_constraint),
            'boundary_constraint': constraints.serialize(self.boundary_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint),
            'input_dim': self.input_dim,
            'unroll': self.unroll}
        base_config = super(CRF, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    # @property
    # def loss_function(self):
    #     warnings.warn('CRF.loss_function is deprecated '
    #                   'and it might be removed in the future. Please '
    #                   'use losses.crf_loss instead.')
    #     return crf_loss
    #
    # @property
    # def accuracy(self):
    #     warnings.warn('CRF.accuracy is deprecated and it '
    #                   'might be removed in the future. Please '
    #                   'use metrics.crf_accuracy')
    #     if self.test_mode == 'viterbi':
    #         return crf_viterbi_accuracy
    #     else:
    #         return crf_marginal_accuracy
    #
    # @property
    # def viterbi_acc(self):
    #     warnings.warn('CRF.viterbi_acc is deprecated and it might '
    #                   'be removed in the future. Please '
    #                   'use metrics.viterbi_acc instead.')
    #     return crf_viterbi_accuracy
    #
    # @property
    # def marginal_acc(self):
    #     warnings.warn('CRF.marginal_acc is deprecated and it '
    #                   'might be removed in the future. Please '
    #                   'use metrics.marginal_acc instead.')
    #     return crf_marginal_accuracy

    @staticmethod
    def softmaxNd(x, axis=-1):
        m = K.max(x, axis=axis, keepdims=True)
        exp_x = K.exp(x - m)
        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
        return prob_x

    @staticmethod
    def shift_left(x, offset=1):
        assert offset > 0
        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)

    @staticmethod
    def shift_right(x, offset=1):
        assert offset > 0
        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)

    def add_boundary_energy(self, energy, mask, start, end):
        start = K.expand_dims(K.expand_dims(start, 0), 0)
        end = K.expand_dims(K.expand_dims(end, 0), 0)
        if mask is None:
            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]],
                                   axis=1)
            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end],
                                   axis=1)
        else:
            mask = K.expand_dims(K.cast(mask, K.floatx()))
            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
            energy = energy + start_mask * start
            energy = energy + end_mask * end
        return energy

    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
        """Compute logarithm of the normalization constant Z, where
        Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ
        """
        # should have logZ[:, i] == logZ[:, j] for any i, j
        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
        return logZ[:, 0]

    def get_energy(self, y_true, input_energy, mask):
        """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3
        """
        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
        # (B, T-1)
        chain_energy = K.sum(K.dot(y_true[:, :-1, :],
                                   self.chain_kernel) * y_true[:, 1:, :], 2)

        if mask is not None:
            mask = K.cast(mask, K.floatx())
            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
            chain_mask = mask[:, :-1] * mask[:, 1:]
            input_energy = input_energy * mask
            chain_energy = chain_energy * chain_mask
        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )

        return total_energy

    def get_negative_log_likelihood(self, y_true, X, mask):
        """Compute the loss, i.e., negative log likelihood (normalize by number of time steps)
        likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E
        """
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(input_energy, mask,
                                                    self.left_boundary,
                                                    self.right_boundary)
        energy = self.get_energy(y_true, input_energy, mask)
        logZ = self.get_log_normalization_constant(input_energy, mask,
                                                   input_length=K.int_shape(X)[1])
        nloglik = logZ + energy
        if mask is not None:
            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
        else:
            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
        return nloglik

    def step(self, input_energy_t, states, return_logZ=True):
        # note: in the following, `prev_target_val` has shape = (B, F)
        # where B = batch_size, F = output feature dim
        # Note: `i` is of float32, due to the behavior of `K.rnn`
        prev_target_val, i, chain_energy = states[:3]
        t = K.cast(i[0, 0], dtype='int32')
        if len(states) > 3:
            if K.backend() == 'theano':
                m = states[3][:, t:(t + 2)]
            else:
                m = K.tf.slice(states[3], [0, t], [-1, 2])
            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
            # (1, F, F)*(B, 1, 1) -> (B, F, F)
            chain_energy = chain_energy * K.expand_dims(
                K.expand_dims(m[:, 0] * m[:, 1]))
        if return_logZ:
            # shapes: (1, B, F) + (B, F, 1) -> (B, F, F)
            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
            new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
            return new_target_val, [new_target_val, i + 1]
        else:
            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
            min_energy = K.min(energy, 1)
            # cast for tf-version `K.rnn`
            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
            return argmin_table, [min_energy, i + 1]

    def recursion(self, input_energy, mask=None, go_backwards=False,
                  return_sequences=True, return_logZ=True, input_length=None):
        """Forward (alpha) or backward (beta) recursion
        If `return_logZ = True`, compute the logZ, the normalization constant:
        \[ Z = \sum_{y1, y2, y3} exp(-E)  # energy
          = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
          = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
            sum_{y1} exp(-(u1' y1 + y1' W y2))) \]
        Denote:
            \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
            \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
            \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)) \]
        Note that:
            yi's are one-hot vectors
            u1, u3: boundary energies have been merged
        If `return_logZ = False`, compute the Viterbi's best path lookup table.
        """
        chain_energy = self.chain_kernel
        # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
        chain_energy = K.expand_dims(chain_energy, 0)
        # shape=(B, F), dtype=float32
        prev_target_val = K.zeros_like(input_energy[:, 0, :])

        if go_backwards:
            input_energy = K.reverse(input_energy, 1)
            if mask is not None:
                mask = K.reverse(mask, 1)

        initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
        constants = [chain_energy]

        if mask is not None:
            mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
                           K.floatx())
            constants.append(mask2)

        def _step(input_energy_i, states):
            return self.step(input_energy_i, states, return_logZ)

        target_val_last, target_val_seq, _ = K.rnn(_step, input_energy,
                                                   initial_states,
                                                   constants=constants,
                                                   input_length=input_length,
                                                   unroll=self.unroll)

        if return_sequences:
            if go_backwards:
                target_val_seq = K.reverse(target_val_seq, 1)
            return target_val_seq
        else:
            return target_val_last

    def forward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, **kwargs)

    def backward_recursion(self, input_energy, **kwargs):
        return self.recursion(input_energy, go_backwards=True, **kwargs)

    def get_marginal_prob(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(input_energy, mask,
                                                    self.left_boundary,
                                                    self.right_boundary)
        input_length = K.int_shape(X)[1]
        alpha = self.forward_recursion(input_energy, mask=mask,
                                       input_length=input_length)
        beta = self.backward_recursion(input_energy, mask=mask,
                                       input_length=input_length)
        if mask is not None:
            input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx()))
        margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta))
        return self.softmaxNd(margin)

    def viterbi_decoding(self, X, mask=None):
        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
        if self.use_boundary:
            input_energy = self.add_boundary_energy(
                input_energy, mask, self.left_boundary, self.right_boundary)

        argmin_tables = self.recursion(input_energy, mask, return_logZ=False)
        argmin_tables = K.cast(argmin_tables, 'int32')

        # backward to find best path, `initial_best_idx` can be any,
        # as all elements in the last argmin_table are the same
        argmin_tables = K.reverse(argmin_tables, 1)
        # matrix instead of vector is required by tf `K.rnn`
        initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
        if K.backend() == 'theano':
            initial_best_idx = [K.T.unbroadcast(initial_best_idx[0], 1)]

        def gather_each_row(params, indices):
            n = K.shape(indices)[0]
            if K.backend() == 'theano':
                return params[K.T.arange(n), indices]
            else:
                indices = K.transpose(K.stack([K.tf.range(n), indices]))
                return K.tf.gather_nd(params, indices)

        def find_path(argmin_table, best_idx):
            next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0])
            next_best_idx = K.expand_dims(next_best_idx)
            if K.backend() == 'theano':
                next_best_idx = K.T.unbroadcast(next_best_idx, 1)
            return next_best_idx, [next_best_idx]

        _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx,
                                 input_length=K.int_shape(X)[1], unroll=self.unroll)
        best_paths = K.reverse(best_paths, 1)
        best_paths = K.squeeze(best_paths, 2)

        return K.one_hot(best_paths, self.units)
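Because this file bundles the layer, loss, and metric together (in keras-contrib they live in separate modules), a small end-to-end smoke test can be built from it alone. A minimal sketch with a toy tagger and one-hot targets; shapes and sizes are arbitrary:

```python
# sketch: wire the local CRF, crf_loss and crf_accuracy into a toy tagger
import numpy as np
from keras.layers import Embedding, Input
from keras.models import Model
from Ner.bert.keras_bert_layer import CRF, crf_loss, crf_accuracy

n_tags, seq_len, vocab = 5, 10, 100
inp = Input(shape=(seq_len,))
emb = Embedding(vocab, 32, mask_zero=True)(inp)
crf = CRF(n_tags, sparse_target=False)  # one-hot targets, learn_mode='join' by default
out = crf(emb)
model = Model(inp, out)
model.compile('adam', loss=crf_loss, metrics=[crf_accuracy])

x = np.random.randint(1, vocab, size=(8, seq_len))
y = np.eye(n_tags)[np.random.randint(0, n_tags, size=(8, seq_len))]  # (8, seq_len, n_tags)
model.fit(x, y, epochs=1, verbose=0)
```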
335
Ner/bert/keras_bert_ner_bi_lstm.py
Normal file
@@ -0,0 +1,335 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/10 18:05
# @author :Mo
# @function :ner with bert and bi-lstm

from __future__ import division, absolute_import

import logging as logger
import numpy as np
import pickle
import codecs

# bert embedding
from Ner.bert.keras_bert_layer import CRF, crf_loss, crf_accuracy
from Ner.bert.keras_bert_embedding import KerasBertEmbedding
from Ner.bert.keras_bert_layer import NonMaskingLayer

# bert trained path
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Bidirectional, CuDNNGRU, CuDNNLSTM
from keras.layers import Dense, Dropout
from keras.layers import GRU, LSTM, TimeDistributed
from keras.objectives import sparse_categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

# bert sequence tagging
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras_bert import Tokenizer

# corpus path
from Ner.bert import args
from conf.path_config import path_ner_people_train, path_ner_people_dev, path_ner_people_test
from conf.feature_config import vocab_file


class BertNerBiLstmModel():
    def __init__(self):
        # logger.info("BertBiLstmModel init start!")
        print("BertNerBiLstmModel init start!")
        self.dict_path, self.max_seq_len, self.keep_prob, self.is_training = vocab_file, args.max_seq_len, args.keep_prob, args.is_training
        # read the tokenizer vocabulary
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)

        self.tokenizer = Tokenizer(self.token_dict)
        # you can choose one model build: bi-lstm single, bi-lstm 3-layers, bi-lstm-attention
        self.build_model_bilstm_layers()
        self.compile_model()
        # self.build_model_bilstm_single()
        # logger.info("BertBiLstmModel init end!")
        print("BertNerBiLstmModel init end!")

    def process_single(self, texts):
        # text preprocessing: takes a list and returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            if isinstance(text, list):
                text = "".join(text)
            logger.info(text)
            tokens_text = self.tokenizer.tokenize(text)
            logger.info('Tokens: %s', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]

    def process_pair(self, textss):
        # text preprocessing: takes a list of pairs and returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for texts in textss:
            tokens_text = self.tokenizer.tokenize(texts[0])
            logger.info('Tokens1: %s', tokens_text)
            tokens_text2 = self.tokenizer.tokenize(texts[1])
            logger.info('Tokens2: %s', tokens_text2)
            input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert the lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]

    def build_model_bilstm_layers(self):
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()

        # Bi-LSTM
        x = Bidirectional(layer_cell(units=args.units,
                                     return_sequences=args.return_sequences,
                                     ))(bert_output)
        # final layers: dropout, dense projection and CRF
        x = TimeDistributed(Dropout(self.keep_prob))(x)  # note: Dropout takes a drop rate; keep_prob is used directly here
        dense_layer = Dense(args.max_seq_len, activation=args.activation)(x)
        crf = CRF(args.label, sparse_target=False, learn_mode="join", test_mode='viterbi')
        output_layers = crf(dense_layer)
        self.model = Model(bert_inputs, output_layers)
        self.model.summary(132)

    def compile_model(self):
        self.model.compile(
            optimizer=Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=args.epsilon, decay=0.0),
            loss=crf_loss if args.use_crf else sparse_categorical_crossentropy,
            metrics=[crf_accuracy] if args.metrics == 'crf_loss' else args.metrics)
        # loss=CRF.loss_function if args.use_crf else categorical_crossentropy,
        # metrics=[CRF.accuracy] if args.metrics == 'crf_loss' else args.metrics)
        # loss=crf.loss if args.use_crf else categorical_crossentropy,
        # metrics=[crf.accuracy] if args.metrics == 'crf_loss' else args.metrics)

    def callback(self):
        cb = [ModelCheckpoint(monitor='val_loss', mode='min', filepath=args.path_save_model, verbose=1, save_best_only=True, save_weights_only=False),
              ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.2, patience=2, verbose=0, epsilon=1e-6, cooldown=4, min_lr=1e-8),
              EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=2)
              ]
        return cb

    def fit(self, x_train, y_train, x_dev, y_dev):
        self.model.fit(x_train, y_train, batch_size=args.batch_size,
                       epochs=args.epochs, validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        self.model.save(args.path_save_model)

    def load_model(self):
        print("BertNerBiLstmModel load_model start!")
        # logger.info("BertBiLstmModel load_model start!")
        self.model.load_weights(args.path_save_model)
        # logger.info("BertBiLstmModel load_model end!")
        print("BertNerBiLstmModel load_model end!")

    def predict(self, sen):
        input_ids, input_masks, input_type_ids = self.process_single([sen])
        probs = self.model.predict([input_ids, input_masks], batch_size=1)
        probs_first = probs[0]
        preds = []
        for prob_one in probs_first:
            prob_max = np.argmax(prob_one)
            preds.append(prob_max)
        return preds

    def predict_list(self, questions):
        label_preds = []
        for questions_pair in questions:
            input_ids, input_masks, input_type_ids = self.process_single([questions_pair])
            label_pred = self.model.predict([input_ids, input_masks], batch_size=1)
            label_preds.append(label_pred)
        return label_preds


def get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(file_path):
    """
    read the People's Daily NER corpus; really just a plain-text reader
    :param file_path: str, text
    :return: list, list
    """
    _x_, _y_ = [], []
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.read().splitlines()
        x, y = [], []
        for line_one in lines:
            rows = line_one.split(" ")
            if len(rows) == 1:
                _x_.append(x), _y_.append(y)
                x, y = [], []
            else:
                x.append(rows[0]), y.append(rows[1])
    return _x_, _y_


def label_tagging(data_x_s, tag_label2index, len_max=32):
    """
    pad and one-hot (to_categorical) the label sequences, given the label dict, the corpus y and the maximum text length
    :param data_x_s: list
    :param tag_label2index: dict
    :param len_max: int
    :return: list
    """
    tag_labels = []
    for data_x in data_x_s:
        if len(data_x) <= len_max - 2:
            tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x] + [tag_label2index['O'] for i in range(len_max - len(data_x) - 1)])
        else:
            tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x[:len_max - 1]] + [tag_label2index['O']])

    tag_labels_pad = pad_sequences(sequences=tag_labels, maxlen=len_max, dtype='int32',
                                   padding='post', truncating='post', value=tag_label2index['O'])
    one_hot_y = to_categorical(tag_labels_pad, num_classes=len(tag_label2index))

    label_num = len(set(["".join(str(i)) for i in tag_labels]))
    # tag_labels_pad_to = to_categorical(y=tag_labels_pad.tolist(), num_classes=label_num)
    return one_hot_y, label_num


def label_tagging_predict(y_predicts, tag_i2l):
    y_preds = []
    count_y_predict = y_predicts[0].shape[1]
    for y_predict in y_predicts:
        temp = []
        for i in range(count_y_predict):
            y_predict_list = y_predict[0][i].tolist()
            y_predict_max = y_predict_list.index(max(y_predict_list))
            pred_label = tag_i2l[y_predict_max]
            temp.append(pred_label)
        y_preds.append(temp)
    return y_preds


def create_label_index_dict(data_x_s):
    """
    build one-to-one label-to-index and index-to-label mappings
    :param data_x_s: list, labels of train data
    :return: list, list
    """
    # first build index2label and label2index
    tag_label2index = {}
    tag_index2label = {}
    data_x_s_one = []
    for d in data_x_s:
        data_x_s_one = data_x_s_one + d
    label_data_x_s = list(set(data_x_s_one))
    for i in range(len(label_data_x_s)):
        tag_label2index[label_data_x_s[i]] = i
        tag_index2label[i] = label_data_x_s[i]
    return tag_label2index, tag_index2label


def process_ner_y(y_data, length_max):
    """
    generate the input_y fed to the model from the training labels y
    :param y_data: list
    :param length_max: int
    :return: list, dict, dict
    """
    # persist the label dictionaries
    import os
    if not os.path.exists(args.path_tag_li):
        tag_l2i, tag_i2l = create_label_index_dict(y_data)
        with open(args.path_tag_li, 'wb') as f:
            pickle.dump((tag_l2i, tag_i2l), f)
    else:
        with open(args.path_tag_li, 'rb') as f:
            tag_l2i, tag_i2l = pickle.load(f)
    tagging_index, label_num = y_data, length_max
    try:
        # tagging
        tagging_index, label_num = label_tagging(y_data, tag_l2i, length_max)
    except Exception:
        pass  # keep the raw labels if tagging fails

    return tagging_index, label_num, tag_l2i, tag_i2l


def train():
    # 1. train
    bert_model = BertNerBiLstmModel()
    # bert_model.compile_model()
    print("process corpus start!")
    # read the corpus
    x_train, y_train = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_train)
    x_dev, y_dev = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_dev)
    # index and pad the questions and labels
    x_train = bert_model.process_single(x_train)
    x_dev = bert_model.process_single(x_dev)
    y_train_tagging_index, label_num, tag_l2i, tag_i2l = process_ner_y(y_train, args.max_seq_len)
    y_dev_tagging_index, _, _, _ = process_ner_y(y_dev, args.max_seq_len)
    # args.label = label_num
    print(label_num)
    print("process corpus end!")
    # gg = x_train[0:2]
    x_train_2 = x_train[0:2]  # keep only token ids and masks to match the model's two inputs
    x_dev_2 = x_dev[0:2]
    print(x_train_2.__sizeof__())
    print(x_dev_2.__sizeof__())
    y_train_2 = y_train_tagging_index
    y_dev_2 = y_dev_tagging_index

    bert_model.fit(x_train_2, y_train_2, x_dev_2, y_dev_2)


def predict():
    # 3. predict
    _, _, tag_l2i, tag_i2l = process_ner_y([], args.max_seq_len)  # only to load the tag dictionaries
    bert_model = BertNerBiLstmModel()
    bert_model.load_model()
    pred = bert_model.predict(sen='欧美和台湾经济怎么样')
    tag_labels = []
    for pre in pred:
        tag_labels.append(tag_i2l[pre])
    print(tag_labels)
    while True:
        print("sen: ")
        sen_1 = input()
        pred = bert_model.predict(sen=sen_1)
        tag_labels = []
        for pre in pred:
            tag_labels.append(tag_i2l[pre])
        print(tag_labels)


if __name__ == "__main__":
    train()
    # predict()
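The `predict()` loop above prints one tag per character. The People's Daily corpus uses BIO tags (B-PER/I-PER, B-LOC/I-LOC, B-ORG/I-ORG, O — the seven labels behind `label = 7` in args.py), so a small grouper usually follows to turn the tag sequence into entity spans. A hedged sketch; the helper name is illustrative:

```python
def bio_to_entities(tokens, tags):
    """group per-token BIO tags into (entity_text, entity_type) spans"""
    entities, buf, buf_type = [], [], None
    for tok, tag in zip(tokens, tags):
        if tag.startswith('B-'):
            if buf:
                entities.append(("".join(buf), buf_type))
            buf, buf_type = [tok], tag[2:]
        elif tag.startswith('I-') and buf and tag[2:] == buf_type:
            buf.append(tok)
        else:  # 'O' or an inconsistent I- tag closes the current span
            if buf:
                entities.append(("".join(buf), buf_type))
            buf, buf_type = [], None
    if buf:
        entities.append(("".join(buf), buf_type))
    return entities

# e.g. bio_to_entities(list('欧美和台湾经济怎么样'), tag_labels)
```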
78
Ner/bert/layer_crf_bojone.py
Normal file
@@ -0,0 +1,78 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/5/26 9:29
# @author :Mo
# @function :

from __future__ import absolute_import
from __future__ import division

from keras.layers import Layer
import keras.backend as K


class CRF(Layer):
    """
    codes from: https://github.com/bojone/crf/blob/master/crf_keras.py
    A pure-Keras implementation of a CRF layer.
    The CRF layer is essentially a loss-computation layer with trainable
    parameters, so it is only used for training; prediction requires
    building a separate model.
    """

    def __init__(self, ignore_last_label=False, **kwargs):
        """ignore_last_label: whether to ignore the last label, which then acts as a mask
        """
        self.ignore_last_label = 1 if ignore_last_label else 0
        super(CRF, self).__init__(**kwargs)

    def build(self, input_shape):
        self.num_labels = input_shape[-1] - self.ignore_last_label
        self.trans = self.add_weight(name='crf_trans',
                                     shape=(self.num_labels, self.num_labels),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def log_norm_step(self, inputs, states):
        """Recursively compute the normalization factor.
        Key points: 1. recursive computation; 2. logsumexp to avoid overflow.
        Trick: align tensors via expand_dims.
        """
        states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
        trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
        output = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
        return output + inputs, [output + inputs]

    def path_score(self, inputs, labels):
        """Compute the relative probability of the target path (not yet normalized).
        Key points: per-label scores plus transition scores.
        Trick: extract the target path's score by dot-multiplying "predictions" with "targets".
        """
        point_score = K.sum(K.sum(inputs * labels, 2), 1, keepdims=True)  # per-label score
        labels1 = K.expand_dims(labels[:, :-1], 3)
        labels2 = K.expand_dims(labels[:, 1:], 2)
        labels = labels1 * labels2  # two offset label tensors, used to pick the target transition scores out of the transition matrix
        trans = K.expand_dims(K.expand_dims(self.trans, 0), 0)
        trans_score = K.sum(K.sum(trans * labels, [2, 3]), 1, keepdims=True)
        return point_score + trans_score  # sum of the two scores

    def call(self, inputs):  # CRF does not change the outputs; it is only a loss
        return inputs

    def loss(self, y_true, y_pred):  # the target labels must be in one-hot form
        mask = 1 - y_true[:, 1:, -1] if self.ignore_last_label else None
        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
        init_states = [y_pred[:, 0]]  # initial states
        log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states, mask=mask)  # compute the Z vector (in log space)
        log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # compute Z (in log space)
        path_score = self.path_score(y_pred, y_true)  # compute the numerator (in log space)
        return log_norm - path_score  # i.e. log(numerator/denominator)

    def accuracy(self, y_true, y_pred):  # per-frame accuracy shown during training, excluding the effect of the mask
        mask = 1 - y_true[:, :, -1] if self.ignore_last_label else None
        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
        isequal = K.equal(K.argmax(y_true, 2), K.argmax(y_pred, 2))
        isequal = K.cast(isequal, 'float32')
        if mask is None:
            return K.mean(isequal)
        else:
            return K.sum(isequal * mask) / K.sum(mask)
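Unlike the keras-contrib port above, this layer is an identity on the forward pass and contributes only its bound `loss` and `accuracy` methods, so it is wired in by calling the instance on per-tag logits and handing those methods to `compile`. A minimal sketch assuming one-hot labels with a trailing mask label (`ignore_last_label=True`); sizes are arbitrary:

```python
# sketch: attach the pure-Keras CRF as a loss over per-position tag logits
from keras.layers import Dense, Embedding, Input
from keras.models import Model
from Ner.bert.layer_crf_bojone import CRF

n_tags, seq_len, vocab = 5, 10, 100  # 4 real tags + 1 mask label
inp = Input(shape=(seq_len,))
h = Embedding(vocab, 32)(inp)
logits = Dense(n_tags)(h)            # per-position tag scores
crf = CRF(ignore_last_label=True)
out = crf(logits)                    # identity on the forward pass
model = Model(inp, out)
model.compile('adam', loss=crf.loss, metrics=[crf.accuracy])
```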
1
Ner/bert/models/bilstm/useless.txt
Normal file
@@ -0,0 +1 @@
useless