diff --git a/Ner/__init__.py b/Ner/__init__.py
new file mode 100644
index 0000000..079ab0b
--- /dev/null
+++ b/Ner/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/5/21 15:23
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/Ner/bert/__init__.py b/Ner/bert/__init__.py
new file mode 100644
index 0000000..079ab0b
--- /dev/null
+++ b/Ner/bert/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/5/21 15:23
+# @author :Mo
+# @function :
\ No newline at end of file
diff --git a/Ner/bert/args.py b/Ner/bert/args.py
new file mode 100644
index 0000000..cb42163
--- /dev/null
+++ b/Ner/bert/args.py
@@ -0,0 +1,33 @@
+# bi-lstm
+return_sequences = True
+use_cudnn_cell = True
+use_lstm = True
+use_crf = True
+is_training = True
+
+loss = 'categorical_crossentropy'
+metrics = ['accuracy']  # 'crf_loss' # ['accuracy']
+activation = 'relu'
+optimizers = 'adam'
+learning_rate = 1e-3
+epsilon = 1e-9
+embedding_dim = 768
+keep_prob = 0.5
+units = 256
+decay = 0.0
+label = 7
+l2 = 0.032
+
+epochs = 320
+batch_size = 16
+path_save_model = 'models/bilstm/bert_ner_bilstm_no_12_config.h5'
+path_tag_li = 'models/bilstm/tag_l_i.pkl'
+
+# GPU memory usage fraction
+gpu_memory_fraction = 0.32
+
+# for NER every layer is extracted; for sentence vectors the default is the
+# second-to-last layer's output
+layer_indexes = [i for i in range(13)]  # [-2]
+
+# maximum sequence length
+max_seq_len = 50
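+
+# Editor's note, a hypothetical sanity check: keras_bert_embedding.py
+# concatenates all 13 extracted layers, so each token vector has size
+# embedding_dim * len(layer_indexes):
+#   >>> 768 * 13
+#   9984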
diff --git a/Ner/bert/keras_bert_embedding.py b/Ner/bert/keras_bert_embedding.py
new file mode 100644
index 0000000..307100f
--- /dev/null
+++ b/Ner/bert/keras_bert_embedding.py
@@ -0,0 +1,90 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/5/8 20:04
+# @author :Mo
+# @function :bert embedding for keras
+
+import os
+
+import keras.backend.tensorflow_backend as ktf_keras
+import tensorflow as tf
+from Ner.bert.keras_bert_layer import NonMaskingLayer
+from keras.layers import Add, Concatenate
+from keras.models import Model
+from keras_bert import load_trained_model_from_checkpoint
+
+from Ner.bert.args import gpu_memory_fraction, max_seq_len, layer_indexes
+from conf.feature_config import config_name, ckpt_name, vocab_file
+
+# module-level globals so the model can be shared by django, flask, tornado, etc.
+graph = None
+model = None
+
+# GPU configuration and memory usage fraction
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+config = tf.ConfigProto()
+config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction
+sess = tf.Session(config=config)
+ktf_keras.set_session(sess)
+
+
+class KerasBertEmbedding():
+    def __init__(self):
+        self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len
+
+    def bert_encode(self):
+        # module-level globals so the model can be shared by django, flask, tornado, etc.
+        global graph
+        graph = tf.get_default_graph()
+        global model
+        model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path,
+                                                   seq_len=self.max_seq_len)
+        bert_layers = model.layers
+        # return model
+        print(bert_layers)
+        print(model.output)
+        print(len(model.layers))
+        # 104 layers in total: the first 8 are the token/position/segment
+        # embeddings etc., then each of the 12 transformer blocks adds 8 layers
+        # (MultiHeadAttention, Dropout, Add, LayerNormalization, twice over),
+        # on top of the untouched input layer.
+        # layer_dict[k] is the index of the output of block k (0 = embeddings);
+        # note: range(12), not range(13), otherwise the last entry would point
+        # past the 104 layers of the model.
+        layer_dict = [7]
+        layer_0 = 7
+        for i in range(12):
+            layer_0 = layer_0 + 8
+            layer_dict.append(layer_0)
+
+        # output the model's own final output
+        if len(layer_indexes) == 0:
+            encoder_layer = model.output
+        # if a single layer is requested, take that layer's output; fall back
+        # to the last layer when the index is out of range
+        elif len(layer_indexes) == 1:
+            if layer_indexes[0] in [i + 1 for i in range(12)]:
+                encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output
+            else:
+                encoder_layer = model.get_layer(index=layer_dict[-1]).output
+        # otherwise iterate over the requested layers and concatenate their
+        # outputs; resulting shape: 768 * number of layers
+        else:
+            # layer_indexes must be in [0, 1, 2, ..., 12]
+            # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes]
+            all_layers = [model.get_layer(index=layer_dict[lay]).output if lay in [i for i in range(13)]
+                          else model.get_layer(index=layer_dict[-1]).output  # default to the last layer for invalid indexes
+                          for lay in layer_indexes]
+            print(layer_indexes)
+            print(all_layers)
+            all_layers_select = []
+            for all_layers_one in all_layers:
+                all_layers_select.append(all_layers_one)
+            # encoder_layer = Add()(all_layers_select)
+            encoder_layer = Concatenate(axis=-1)(all_layers_select)
+            print(encoder_layer.shape)
+        print("KerasBertEmbedding:")
+        print(encoder_layer.shape)
+        output = NonMaskingLayer()(encoder_layer)
+        model = Model(model.inputs, output)
+        # model.summary(120)
+        return model.inputs, model.output
+
+
+if __name__ == "__main__":
+    bert_vector = KerasBertEmbedding()
+    pooled = bert_vector.bert_encode()
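+
+    # Editor's sketch, a hypothetical smoke test (assumes a BERT-base checkpoint
+    # is configured in conf.feature_config): wrapping the returned tensors in a
+    # Model should give output shape (None, max_seq_len, 768 * len(layer_indexes)),
+    # i.e. (None, 50, 9984) with the defaults:
+    # inputs, output = pooled
+    # m = Model(inputs, output)
+    # print(m.output_shape)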
+ ''' + + def __init__(self, return_attention=False, **kwargs): + self.init = initializers.get('uniform') + self.supports_masking = True + self.return_attention = return_attention + super(AttentionWeightedAverage, self).__init__(**kwargs) + + def build(self, input_shape): + self.input_spec = [InputSpec(ndim=3)] + assert len(input_shape) == 3 + + self.W = self.add_weight(shape=(input_shape[2], 1), + name='{}_w'.format(self.name), + initializer=self.init) + self.trainable_weights = [self.W] + super(AttentionWeightedAverage, self).build(input_shape) + + def call(self, x, mask=None): + # computes a probability distribution over the timesteps + # uses 'max trick' for numerical stability + # reshape is done to avoid issue with Tensorflow + # and 1-dimensional weights + logits = k_keras.dot(x, self.W) + x_shape = k_keras.shape(x) + logits = k_keras.reshape(logits, (x_shape[0], x_shape[1])) + ai = k_keras.exp(logits - k_keras.max(logits, axis=-1, keepdims=True)) + + # masked timesteps have zero weight + if mask is not None: + mask = k_keras.cast(mask, k_keras.floatx()) + ai = ai * mask + att_weights = ai / (k_keras.sum(ai, axis=1, keepdims=True) + k_keras.epsilon()) + weighted_input = x * k_keras.expand_dims(att_weights) + result = k_keras.sum(weighted_input, axis=1) + if self.return_attention: + return [result, att_weights] + return result + + def get_output_shape_for(self, input_shape): + return self.compute_output_shape(input_shape) + + def compute_output_shape(self, input_shape): + output_len = input_shape[2] + if self.return_attention: + return [(input_shape[0], output_len), (input_shape[0], input_shape[1])] + return (input_shape[0], output_len) + + def compute_mask(self, input, input_mask=None): + if isinstance(input_mask, list): + return [None] * len(input_mask) + else: + return None + + +# crf_loss +def crf_nll(y_true, y_pred): + """The negative log-likelihood for linear chain Conditional Random Field (CRF). + This loss function is only used when the `layers.CRF` layer + is trained in the "join" mode. + # Arguments + y_true: tensor with true targets. + y_pred: tensor with predicted targets. + # Returns + A scalar representing corresponding to the negative log-likelihood. + # Raises + TypeError: If CRF is not the last layer. + # About GitHub + If you open an issue or a pull request about CRF, please + add `cc @lzfelix` to notify Luiz Felix. + """ + + crf, idx = y_pred._keras_history[:2] + if crf._outbound_nodes: + raise TypeError('When learn_model="join", CRF must be the last layer.') + if crf.sparse_target: + y_true = K.one_hot(K.cast(y_true[:, :, 0], 'int32'), crf.units) + X = crf._inbound_nodes[idx].input_tensors[0] + mask = crf._inbound_nodes[idx].input_masks[0] + nloglik = crf.get_negative_log_likelihood(y_true, X, mask) + # 新加的 + # nloglik = k_keras.abs(nloglik) + return nloglik + +def crf_loss(y_true, y_pred): + """General CRF loss function depending on the learning mode. + # Arguments + y_true: tensor with true targets. + y_pred: tensor with predicted targets. + # Returns + If the CRF layer is being trained in the join mode, returns the negative + log-likelihood. Otherwise returns the categorical crossentropy implemented + by the underlying Keras backend. + # About GitHub + If you open an issue or a pull request about CRF, please + add `cc @lzfelix` to notify Luiz Felix. 
+ """ + crf, idx = y_pred._keras_history[:2] + if crf.learn_mode == 'join': + return crf_nll(y_true, y_pred) + else: + if crf.sparse_target: + return sparse_categorical_crossentropy(y_true, y_pred) + else: + return categorical_crossentropy(y_true, y_pred) + +# crf_marginal_accuracy, crf_viterbi_accuracy +def _get_accuracy(y_true, y_pred, mask, sparse_target=False): + """ + :param y_true: + :param y_pred: + :param mask: + :param sparse_target: + :return: + """ + y_pred = K.argmax(y_pred, -1) + if sparse_target: + y_true = K.cast(y_true[:, :, 0], K.dtype(y_pred)) + else: + y_true = K.argmax(y_true, -1) + judge = K.cast(K.equal(y_pred, y_true), K.floatx()) + if mask is None: + return K.mean(judge) + else: + mask = K.cast(mask, K.floatx()) + return K.sum(judge * mask) / K.sum(mask) + +def crf_viterbi_accuracy(y_true, y_pred): + '''Use Viterbi algorithm to get best path, and compute its accuracy. + `y_pred` must be an output from CRF.''' + crf, idx = y_pred._keras_history[:2] + X = crf._inbound_nodes[idx].input_tensors[0] + mask = crf._inbound_nodes[idx].input_masks[0] + y_pred = crf.viterbi_decoding(X, mask) + return _get_accuracy(y_true, y_pred, mask, crf.sparse_target) + +def crf_marginal_accuracy(y_true, y_pred): + '''Use time-wise marginal argmax as prediction. + `y_pred` must be an output from CRF with `learn_mode="marginal"`.''' + crf, idx = y_pred._keras_history[:2] + X = crf._inbound_nodes[idx].input_tensors[0] + mask = crf._inbound_nodes[idx].input_masks[0] + y_pred = crf.get_marginal_prob(X, mask) + return _get_accuracy(y_true, y_pred, mask, crf.sparse_target) + +def crf_accuracy(y_true, y_pred): + '''Ge default accuracy based on CRF `test_mode`.''' + crf, idx = y_pred._keras_history[:2] + if crf.test_mode == 'viterbi': + return crf_viterbi_accuracy(y_true, y_pred) + else: + return crf_marginal_accuracy(y_true, y_pred) + +def to_tuple(shape): + """This functions is here to fix an inconsistency between keras and tf.keras. + In tf.keras, the input_shape argument is an tuple with `Dimensions` objects. + In keras, the input_shape is a simple tuple of ints or `None`. + We'll work with tuples of ints or `None` to be consistent + with keras-team/keras. So we must apply this function to + all input_shapes of the build methods in custom layers. + """ + if is_tf_keras: + import tensorflow as tf + return tuple(tf.TensorShape(shape).as_list()) + else: + return shape + +class CRF(Layer): + """ + codes from: https://github.com/keras-team/keras-contrib + detail: https://github.com/keras-team/keras-contrib/blob/fff264273d5347613574ff533c598f55f15d4763/keras_contrib/layers/crf.py + + An implementation of linear chain conditional random field (CRF). + An linear chain CRF is defined to maximize the following likelihood function: + $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z} + \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n + - \sum_{k=1^n}((f(x_k' W + b) y_k) + y_1' U y_2)), $$ + where: + $Z$: normalization constant + $x_k, y_k$: inputs and outputs + This implementation has two modes for optimization: + 1. (`join mode`) optimized by maximizing join likelihood, + which is optimal in theory of statistics. + Note that in this case, CRF must be the output/last layer. + 2. (`marginal mode`) return marginal probabilities on each time + step and optimized via composition + likelihood (product of marginal likelihood), i.e., + using `categorical_crossentropy` loss. + Note that in this case, CRF can be either the last layer or an + intermediate layer (though not explored). 
+
+class CRF(Layer):
+    """
+    codes from: https://github.com/keras-team/keras-contrib
+    detail: https://github.com/keras-team/keras-contrib/blob/fff264273d5347613574ff533c598f55f15d4763/keras_contrib/layers/crf.py
+
+    An implementation of linear chain conditional random field (CRF).
+    A linear chain CRF is defined to maximize the following likelihood function:
+    $$ L(W, U, b; y_1, ..., y_n) := \frac{1}{Z}
+    \sum_{y_1, ..., y_n} \exp(-a_1' y_1 - a_n' y_n
+        - \sum_{k=1}^n ((f(x_k' W + b) y_k) + y_1' U y_2)), $$
+    where:
+        $Z$: normalization constant
+        $x_k, y_k$: inputs and outputs
+    This implementation has two modes for optimization:
+    1. (`join mode`) optimized by maximizing the joint likelihood,
+    which is optimal in theory of statistics.
+        Note that in this case, CRF must be the output/last layer.
+    2. (`marginal mode`) return marginal probabilities on each time
+    step and optimized via composition
+    likelihood (product of marginal likelihoods), i.e.,
+    using `categorical_crossentropy` loss.
+        Note that in this case, CRF can be either the last layer or an
+        intermediate layer (though not explored).
+    For prediction (test phase), one can choose either Viterbi
+    best path (class indices) or marginal
+    probabilities if probabilities are needed.
+    However, if one chooses *join mode* for training,
+    Viterbi output is typically better than marginal output,
+    but the marginal output will still perform
+    reasonably close, while if *marginal mode* is used for training,
+    marginal output usually performs
+    much better. The default behavior and `metrics.crf_accuracy`
+    is set according to this observation.
+    In addition, this implementation supports masking and accepts either
+    onehot or sparse target.
+    If you open an issue or a pull request about CRF, please
+    add 'cc @lzfelix' to notify Luiz Felix.
+    # Examples
+    ```python
+        from keras_contrib.layers import CRF
+        from keras_contrib.losses import crf_loss
+        from keras_contrib.metrics import crf_viterbi_accuracy
+        model = Sequential()
+        model.add(Embedding(3001, 300, mask_zero=True))
+        # use learn_mode = 'join', test_mode = 'viterbi',
+        # sparse_target = True (label indices output)
+        crf = CRF(10, sparse_target=True)
+        model.add(crf)
+        # crf_accuracy is default to Viterbi acc if using join-mode (default).
+        # One can add crf.marginal_acc if interested, but may slow down learning
+        model.compile('adam', loss=crf_loss, metrics=[crf_viterbi_accuracy])
+        # y must be label indices (with shape 1 at dim 3) here,
+        # since `sparse_target=True`
+        model.fit(x, y)
+        # prediction gives one-hot representation of the Viterbi best path
+        y_hat = model.predict(x_test)
+    ```
+    The following snippet shows how to load a persisted
+    model that uses the CRF layer:
+    ```python
+        from keras.models import load_model
+        from keras_contrib.losses import crf_loss
+        from keras_contrib.metrics import crf_viterbi_accuracy
+        custom_objects = {'CRF': CRF,
+                          'crf_loss': crf_loss,
+                          'crf_viterbi_accuracy': crf_viterbi_accuracy}
+        loaded_model = load_model('<path_to_model>',
+                                  custom_objects=custom_objects)
+    ```
+    # Arguments
+        units: Positive integer, dimensionality of the output space.
+        learn_mode: Either 'join' or 'marginal'.
+            The former trains the model by maximizing the joint likelihood while
+            the latter maximizes the product of marginal likelihoods over all time steps.
+            One should use `losses.crf_nll` for 'join' mode
+            and `losses.categorical_crossentropy` or
+            `losses.sparse_categorical_crossentropy` for
+            `marginal` mode. For convenience, simply
+            use `losses.crf_loss`, which will decide the proper loss as described.
+        test_mode: Either 'viterbi' or 'marginal'.
+            The former is recommended and as default when `learn_mode = 'join'` and
+            gives one-hot representation of the best path at test (prediction) time,
+            while the latter is recommended and chosen as default
+            when `learn_mode = 'marginal'`,
+            which produces marginal probabilities for each time step.
+            For evaluating metrics, one should
+            use `metrics.crf_viterbi_accuracy` for 'viterbi' mode and
+            `metrics.crf_marginal_accuracy` for 'marginal' mode, or
+            simply use `metrics.crf_accuracy` for
+            both, which automatically decides it as described.
+            One can also use both for evaluation at training.
+        sparse_target: Boolean (default False) indicating
+            if provided labels are one-hot or
+            indices (with shape 1 at dim 3).
+        use_boundary: Boolean (default True) indicating if trainable
+            start-end chain energies
+            should be added to the model.
+        use_bias: Boolean, whether the layer uses a bias vector.
+        kernel_initializer: Initializer for the `kernel` weights matrix,
+            used for the linear transformation of the inputs.
+            (see [initializers](../initializers.md)).
+        chain_initializer: Initializer for the `chain_kernel` weights matrix,
+            used for the CRF chain energy.
+            (see [initializers](../initializers.md)).
+        boundary_initializer: Initializer for the `left_boundary`,
+            `right_boundary` weights vectors,
+            used for the start/left and end/right boundary energy.
+            (see [initializers](../initializers.md)).
+        bias_initializer: Initializer for the bias vector
+            (see [initializers](../initializers.md)).
+        activation: Activation function to use
+            (see [activations](../activations.md)).
+            If you pass None, no activation is applied
+            (ie. "linear" activation: `a(x) = x`).
+        kernel_regularizer: Regularizer function applied to
+            the `kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        chain_regularizer: Regularizer function applied to
+            the `chain_kernel` weights matrix
+            (see [regularizer](../regularizers.md)).
+        boundary_regularizer: Regularizer function applied to
+            the `left_boundary`, `right_boundary` weight vectors
+            (see [regularizer](../regularizers.md)).
+        bias_regularizer: Regularizer function applied to the bias vector
+            (see [regularizer](../regularizers.md)).
+        kernel_constraint: Constraint function applied to
+            the `kernel` weights matrix
+            (see [constraints](../constraints.md)).
+        chain_constraint: Constraint function applied to
+            the `chain_kernel` weights matrix
+            (see [constraints](../constraints.md)).
+        boundary_constraint: Constraint function applied to
+            the `left_boundary`, `right_boundary` weights vectors
+            (see [constraints](../constraints.md)).
+        bias_constraint: Constraint function applied to the bias vector
+            (see [constraints](../constraints.md)).
+        input_dim: dimensionality of the input (integer).
+            This argument (or alternatively, the keyword argument `input_shape`)
+            is required when using this layer as the first layer in a model.
+        unroll: Boolean (default False). If True, the network will be
+            unrolled, else a symbolic loop will be used.
+            Unrolling can speed up an RNN, although it tends
+            to be more memory-intensive.
+            Unrolling is only suitable for short sequences.
+    # Input shape
+        3D tensor with shape `(nb_samples, timesteps, input_dim)`.
+    # Output shape
+        3D tensor with shape `(nb_samples, timesteps, units)`.
+    # Masking
+        This layer supports masking for input data with a variable number
+        of timesteps. To introduce masks to your data,
+        use an [Embedding](embeddings.md) layer with the `mask_zero` parameter
+        set to `True`.
+ """ + + def __init__(self, units, + learn_mode='join', + test_mode=None, + sparse_target=False, + use_boundary=True, + use_bias=True, + activation='linear', + kernel_initializer='glorot_uniform', + chain_initializer='orthogonal', + bias_initializer='zeros', + boundary_initializer='zeros', + kernel_regularizer=None, + chain_regularizer=None, + boundary_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + chain_constraint=None, + boundary_constraint=None, + bias_constraint=None, + input_dim=None, + unroll=False, + **kwargs): + super(CRF, self).__init__(**kwargs) + self.supports_masking = True + self.units = units + self.learn_mode = learn_mode + assert self.learn_mode in ['join', 'marginal'] + self.test_mode = test_mode + if self.test_mode is None: + self.test_mode = 'viterbi' if self.learn_mode == 'join' else 'marginal' + else: + assert self.test_mode in ['viterbi', 'marginal'] + self.sparse_target = sparse_target + self.use_boundary = use_boundary + self.use_bias = use_bias + + self.activation = activations.get(activation) + + self.kernel_initializer = initializers.get(kernel_initializer) + self.chain_initializer = initializers.get(chain_initializer) + self.boundary_initializer = initializers.get(boundary_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.chain_regularizer = regularizers.get(chain_regularizer) + self.boundary_regularizer = regularizers.get(boundary_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) + + self.kernel_constraint = constraints.get(kernel_constraint) + self.chain_constraint = constraints.get(chain_constraint) + self.boundary_constraint = constraints.get(boundary_constraint) + self.bias_constraint = constraints.get(bias_constraint) + + self.unroll = unroll + + def build(self, input_shape): + # input_shape = to_tuple(input_shape) + self.input_spec = [InputSpec(shape=input_shape)] + self.input_dim = input_shape[-1] + + self.kernel = self.add_weight(shape=(self.input_dim, self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.chain_kernel = self.add_weight(shape=(self.units, self.units), + name='chain_kernel', + initializer=self.chain_initializer, + regularizer=self.chain_regularizer, + constraint=self.chain_constraint) + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = 0 + + if self.use_boundary: + self.left_boundary = self.add_weight(shape=(self.units,), + name='left_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.right_boundary = self.add_weight(shape=(self.units,), + name='right_boundary', + initializer=self.boundary_initializer, + regularizer=self.boundary_regularizer, + constraint=self.boundary_constraint) + self.built = True + + def call(self, X, mask=None): + if mask is not None: + assert K.ndim(mask) == 2, 'Input mask to CRF must have dim 2 if not None' + + if self.test_mode == 'viterbi': + test_output = self.viterbi_decoding(X, mask) + else: + test_output = self.get_marginal_prob(X, mask) + + self.uses_learning_phase = True + if self.learn_mode == 'join': + train_output = K.zeros_like(K.dot(X, self.kernel)) + out = K.in_train_phase(train_output, test_output) + else: + if 
+            if self.test_mode == 'viterbi':
+                train_output = self.get_marginal_prob(X, mask)
+                out = K.in_train_phase(train_output, test_output)
+            else:
+                out = test_output
+        return out
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[:2] + (self.units,)
+
+    def compute_mask(self, input, mask=None):
+        if mask is not None and self.learn_mode == 'join':
+            return K.any(mask, axis=1)
+        return mask
+
+    def get_config(self):
+        config = {
+            'units': self.units,
+            'learn_mode': self.learn_mode,
+            'test_mode': self.test_mode,
+            'use_boundary': self.use_boundary,
+            'use_bias': self.use_bias,
+            'sparse_target': self.sparse_target,
+            'kernel_initializer': initializers.serialize(self.kernel_initializer),
+            'chain_initializer': initializers.serialize(self.chain_initializer),
+            'boundary_initializer': initializers.serialize(
+                self.boundary_initializer),
+            'bias_initializer': initializers.serialize(self.bias_initializer),
+            'activation': activations.serialize(self.activation),
+            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+            'chain_regularizer': regularizers.serialize(self.chain_regularizer),
+            'boundary_regularizer': regularizers.serialize(
+                self.boundary_regularizer),
+            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+            'kernel_constraint': constraints.serialize(self.kernel_constraint),
+            'chain_constraint': constraints.serialize(self.chain_constraint),
+            'boundary_constraint': constraints.serialize(self.boundary_constraint),
+            'bias_constraint': constraints.serialize(self.bias_constraint),
+            'input_dim': self.input_dim,
+            'unroll': self.unroll}
+        base_config = super(CRF, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    # @property
+    # def loss_function(self):
+    #     warnings.warn('CRF.loss_function is deprecated '
+    #                   'and it might be removed in the future. Please '
+    #                   'use losses.crf_loss instead.')
+    #     return crf_loss
+    #
+    # @property
+    # def accuracy(self):
+    #     warnings.warn('CRF.accuracy is deprecated and it '
+    #                   'might be removed in the future. Please '
+    #                   'use metrics.crf_accuracy')
+    #     if self.test_mode == 'viterbi':
+    #         return crf_viterbi_accuracy
+    #     else:
+    #         return crf_marginal_accuracy
+    #
+    # @property
+    # def viterbi_acc(self):
+    #     warnings.warn('CRF.viterbi_acc is deprecated and it might '
+    #                   'be removed in the future. Please '
+    #                   'use metrics.viterbi_acc instead.')
+    #     return crf_viterbi_accuracy
+    #
+    # @property
+    # def marginal_acc(self):
+    #     warnings.warn('CRF.marginal_acc is deprecated and it '
+    #                   'might be removed in the future. Please '
+    #                   'use metrics.marginal_acc instead.')
+    #     return crf_marginal_accuracy
+
+    @staticmethod
+    def softmaxNd(x, axis=-1):
+        m = K.max(x, axis=axis, keepdims=True)
+        exp_x = K.exp(x - m)
+        prob_x = exp_x / K.sum(exp_x, axis=axis, keepdims=True)
+        return prob_x
+
+    @staticmethod
+    def shift_left(x, offset=1):
+        assert offset > 0
+        return K.concatenate([x[:, offset:], K.zeros_like(x[:, :offset])], axis=1)
+
+    @staticmethod
+    def shift_right(x, offset=1):
+        assert offset > 0
+        return K.concatenate([K.zeros_like(x[:, :offset]), x[:, :-offset]], axis=1)
+
+    def add_boundary_energy(self, energy, mask, start, end):
+        start = K.expand_dims(K.expand_dims(start, 0), 0)
+        end = K.expand_dims(K.expand_dims(end, 0), 0)
+        if mask is None:
+            energy = K.concatenate([energy[:, :1, :] + start, energy[:, 1:, :]],
+                                   axis=1)
+            energy = K.concatenate([energy[:, :-1, :], energy[:, -1:, :] + end],
+                                   axis=1)
+        else:
+            mask = K.expand_dims(K.cast(mask, K.floatx()))
+            start_mask = K.cast(K.greater(mask, self.shift_right(mask)), K.floatx())
+            end_mask = K.cast(K.greater(self.shift_left(mask), mask), K.floatx())
+            energy = energy + start_mask * start
+            energy = energy + end_mask * end
+        return energy
+
+    def get_log_normalization_constant(self, input_energy, mask, **kwargs):
+        """Compute logarithm of the normalization constant Z, where
+        Z = sum exp(-E) -> logZ = log sum exp(-E) =: -nlogZ
+        """
+        # should have logZ[:, i] == logZ[:, j] for any i, j
+        logZ = self.recursion(input_energy, mask, return_sequences=False, **kwargs)
+        return logZ[:, 0]
+
+    def get_energy(self, y_true, input_energy, mask):
+        """Energy = a1' y1 + u1' y1 + y1' U y2 + u2' y2 + y2' U y3 + u3' y3 + an' y3
+        """
+        input_energy = K.sum(input_energy * y_true, 2)  # (B, T)
+        # (B, T-1)
+        chain_energy = K.sum(K.dot(y_true[:, :-1, :],
+                                   self.chain_kernel) * y_true[:, 1:, :], 2)
+
+        if mask is not None:
+            mask = K.cast(mask, K.floatx())
+            # (B, T-1), mask[:,:-1]*mask[:,1:] makes it work with any padding
+            chain_mask = mask[:, :-1] * mask[:, 1:]
+            input_energy = input_energy * mask
+            chain_energy = chain_energy * chain_mask
+        total_energy = K.sum(input_energy, -1) + K.sum(chain_energy, -1)  # (B, )
+
+        return total_energy
+
+    def get_negative_log_likelihood(self, y_true, X, mask):
+        """Compute the loss, i.e., negative log likelihood (normalized by the number of time steps)
+        likelihood = 1/Z * exp(-E) -> neg_log_like = - log(1/Z * exp(-E)) = logZ + E
+        """
+        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
+        if self.use_boundary:
+            input_energy = self.add_boundary_energy(input_energy, mask,
+                                                    self.left_boundary,
+                                                    self.right_boundary)
+        energy = self.get_energy(y_true, input_energy, mask)
+        logZ = self.get_log_normalization_constant(input_energy, mask,
+                                                   input_length=K.int_shape(X)[1])
+        nloglik = logZ + energy
+        if mask is not None:
+            nloglik = nloglik / K.sum(K.cast(mask, K.floatx()), 1)
+        else:
+            nloglik = nloglik / K.cast(K.shape(X)[1], K.floatx())
+        return nloglik
+
+    def step(self, input_energy_t, states, return_logZ=True):
+        # note: in the following, `prev_target_val` has shape = (B, F)
+        # where B = batch_size, F = output feature dim
+        # Note: `i` is of float32, due to the behavior of `K.rnn`
+        prev_target_val, i, chain_energy = states[:3]
+        t = K.cast(i[0, 0], dtype='int32')
+        if len(states) > 3:
+            if K.backend() == 'theano':
+                m = states[3][:, t:(t + 2)]
+            else:
+                m = K.tf.slice(states[3], [0, t], [-1, 2])
+            input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
+            # (1, F, F) * (B, 1, 1) -> (B, F, F)
+            chain_energy = chain_energy * K.expand_dims(
+                K.expand_dims(m[:, 0] * m[:, 1]))
+        if return_logZ:
+            # shapes: (1, B, F) + (B, F, 1) -> (B, F, F)
+            energy = chain_energy + K.expand_dims(input_energy_t - prev_target_val, 2)
+            new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
+            return new_target_val, [new_target_val, i + 1]
+        else:
+            energy = chain_energy + K.expand_dims(input_energy_t + prev_target_val, 2)
+            min_energy = K.min(energy, 1)
+            # cast for tf-version `K.rnn`
+            argmin_table = K.cast(K.argmin(energy, 1), K.floatx())
+            return argmin_table, [min_energy, i + 1]
+
+    def recursion(self, input_energy, mask=None, go_backwards=False,
+                  return_sequences=True, return_logZ=True, input_length=None):
+        """Forward (alpha) or backward (beta) recursion
+        If `return_logZ = True`, compute the logZ, the normalization constant:
+        \[ Z = \sum_{y1, y2, y3} exp(-E)  # energy
+          = \sum_{y1, y2, y3} exp(-(u1' y1 + y1' W y2 + u2' y2 + y2' W y3 + u3' y3))
+          = sum_{y2, y3} (exp(-(u2' y2 + y2' W y3 + u3' y3))
+                          sum_{y1} exp(-(u1' y1 + y1' W y2))) \]
+        Denote:
+            \[ S(y2) := sum_{y1} exp(-(u1' y1 + y1' W y2)), \]
+            \[ Z = sum_{y2, y3} exp(log S(y2) - (u2' y2 + y2' W y3 + u3' y3)) \]
+            \[ logS(y2) = log S(y2) = log_sum_exp(-(u1' y1 + y1' W y2)) \]
+        Note that:
+            yi's are one-hot vectors
+            u1, u3: boundary energies have been merged
+        If `return_logZ = False`, compute the Viterbi best-path lookup table.
+        """
+        chain_energy = self.chain_kernel
+        # shape=(1, F, F): F=num of output features. 1st F is for t-1, 2nd F for t
+        chain_energy = K.expand_dims(chain_energy, 0)
+        # shape=(B, F), dtype=float32
+        prev_target_val = K.zeros_like(input_energy[:, 0, :])
+
+        if go_backwards:
+            input_energy = K.reverse(input_energy, 1)
+            if mask is not None:
+                mask = K.reverse(mask, 1)
+
+        initial_states = [prev_target_val, K.zeros_like(prev_target_val[:, :1])]
+        constants = [chain_energy]
+
+        if mask is not None:
+            mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
+                           K.floatx())
+            constants.append(mask2)
+
+        def _step(input_energy_i, states):
+            return self.step(input_energy_i, states, return_logZ)
+
+        target_val_last, target_val_seq, _ = K.rnn(_step, input_energy,
+                                                   initial_states,
+                                                   constants=constants,
+                                                   input_length=input_length,
+                                                   unroll=self.unroll)
+
+        if return_sequences:
+            if go_backwards:
+                target_val_seq = K.reverse(target_val_seq, 1)
+            return target_val_seq
+        else:
+            return target_val_last
+
+    def forward_recursion(self, input_energy, **kwargs):
+        return self.recursion(input_energy, **kwargs)
+
+    def backward_recursion(self, input_energy, **kwargs):
+        return self.recursion(input_energy, go_backwards=True, **kwargs)
+
+    def get_marginal_prob(self, X, mask=None):
+        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
+        if self.use_boundary:
+            input_energy = self.add_boundary_energy(input_energy, mask,
+                                                    self.left_boundary,
+                                                    self.right_boundary)
+        input_length = K.int_shape(X)[1]
+        alpha = self.forward_recursion(input_energy, mask=mask,
+                                       input_length=input_length)
+        beta = self.backward_recursion(input_energy, mask=mask,
+                                       input_length=input_length)
+        if mask is not None:
+            input_energy = input_energy * K.expand_dims(K.cast(mask, K.floatx()))
+        margin = -(self.shift_right(alpha) + input_energy + self.shift_left(beta))
+        return self.softmaxNd(margin)
+
+    def viterbi_decoding(self, X, mask=None):
+        input_energy = self.activation(K.dot(X, self.kernel) + self.bias)
+        if self.use_boundary:
+            input_energy = self.add_boundary_energy(
+                input_energy, mask, self.left_boundary, self.right_boundary)
+
+        argmin_tables = self.recursion(input_energy, mask, return_logZ=False)
+        argmin_tables = K.cast(argmin_tables, 'int32')
+
+        # backward to find best path, `initial_best_idx` can be any,
+        # as all elements in the last argmin_table are the same
+        argmin_tables = K.reverse(argmin_tables, 1)
+        # matrix instead of vector is required by tf `K.rnn`
+        initial_best_idx = [K.expand_dims(argmin_tables[:, 0, 0])]
+        if K.backend() == 'theano':
+            initial_best_idx = [K.T.unbroadcast(initial_best_idx[0], 1)]
+
+        def gather_each_row(params, indices):
+            n = K.shape(indices)[0]
+            if K.backend() == 'theano':
+                return params[K.T.arange(n), indices]
+            else:
+                indices = K.transpose(K.stack([K.tf.range(n), indices]))
+                return K.tf.gather_nd(params, indices)
+
+        def find_path(argmin_table, best_idx):
+            next_best_idx = gather_each_row(argmin_table, best_idx[0][:, 0])
+            next_best_idx = K.expand_dims(next_best_idx)
+            if K.backend() == 'theano':
+                next_best_idx = K.T.unbroadcast(next_best_idx, 1)
+            return next_best_idx, [next_best_idx]
+
+        _, best_paths, _ = K.rnn(find_path, argmin_tables, initial_best_idx,
+                                 input_length=K.int_shape(X)[1], unroll=self.unroll)
+        best_paths = K.reverse(best_paths, 1)
+        best_paths = K.squeeze(best_paths, 2)
+
+        return K.one_hot(best_paths, self.units)
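+
+# Editor's sketch, a hypothetical decode step: with test_mode='viterbi' the CRF
+# layer emits a one-hot best path, so tag ids are recovered with an argmax:
+# import numpy as np
+# tag_ids = np.argmax(model.predict(x), axis=-1)  # shape: (batch, timesteps)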
diff --git a/Ner/bert/keras_bert_ner_bi_lstm.py b/Ner/bert/keras_bert_ner_bi_lstm.py
new file mode 100644
index 0000000..5b689cc
--- /dev/null
+++ b/Ner/bert/keras_bert_ner_bi_lstm.py
@@ -0,0 +1,335 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/5/10 18:05
+# @author :Mo
+# @function :named entity recognition with bert and bi-lstm
+
+from __future__ import division, absolute_import
+
+import logging as logger
+import numpy as np
+import pickle
+import codecs
+
+# bert embedding
+from Ner.bert.keras_bert_layer import CRF, crf_loss, crf_accuracy
+from Ner.bert.keras_bert_embedding import KerasBertEmbedding
+from Ner.bert.keras_bert_layer import NonMaskingLayer
+
+# bert trained path
+from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
+from keras.layers import Bidirectional, CuDNNGRU, CuDNNLSTM
+from keras.layers import Dense, Dropout
+from keras.layers import GRU, LSTM, TimeDistributed
+from keras.objectives import sparse_categorical_crossentropy
+from keras.optimizers import Adam
+from keras.models import Model
+from keras import regularizers
+
+# bert sequence tagging
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import to_categorical
+from keras_bert import Tokenizer
+
+# corpus path
+from Ner.bert import args
+from conf.path_config import path_ner_people_train, path_ner_people_dev, path_ner_people_test
+from conf.feature_config import vocab_file
+
+
+class BertNerBiLstmModel():
+    def __init__(self):
+        # logger.info("BertBiLstmModel init start!")
+        print("BertNerBiLstmModel init start!")
+        self.dict_path, self.max_seq_len, self.keep_prob, self.is_training = vocab_file, args.max_seq_len, args.keep_prob, args.is_training
+        # read the vocab for the tokenizer
+        self.token_dict = {}
+        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
+            for line in reader:
+                token = line.strip()
+                self.token_dict[token] = len(self.token_dict)
+
+        self.tokenizer = Tokenizer(self.token_dict)
+        # pick one model build: bi-lstm single, bi-lstm 3-layers or bi-lstm attention
+        self.build_model_bilstm_layers()
+        self.compile_model()
+        # self.build_model_bilstm_single()
+        # logger.info("BertBiLstmModel init end!")
+        print("BertNerBiLstmModel init end!")
+
+    def process_single(self, texts):
+        # text preprocessing: takes a list, returns ids / masks / type-ids
+        input_ids = []
+        input_masks = []
+        input_type_ids = []
+        for text in texts:
+            if type(text) is list:
+                text = "".join(text)
+            logger.info(text)
+            tokens_text = self.tokenizer.tokenize(text)
+            logger.info('Tokens: %s', tokens_text)
+            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
+            input_mask = [0 if ids == 0 else 1 for ids in input_id]
+            input_ids.append(input_id)
+            input_type_ids.append(input_type_id)
+            input_masks.append(input_mask)
+        # convert the lists to numpy arrays
+        input_ids = np.array(input_ids)
+        input_masks = np.array(input_masks)
+        input_type_ids = np.array(input_type_ids)
+        logger.info("process ok!")
+        return [input_ids, input_masks, input_type_ids]
+
+    def process_pair(self, textss):
+        # text preprocessing: takes a list of pairs, returns ids / masks / type-ids
+        input_ids = []
+        input_masks = []
+        input_type_ids = []
+        for texts in textss:
+            tokens_text = self.tokenizer.tokenize(texts[0])
+            logger.info('Tokens1: %s', tokens_text)
+            tokens_text2 = self.tokenizer.tokenize(texts[1])
+            logger.info('Tokens2: %s', tokens_text2)
+            input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len)
+            input_mask = [0 if ids == 0 else 1 for ids in input_id]
+            input_ids.append(input_id)
+            input_type_ids.append(input_type_id)
+            input_masks.append(input_mask)
+        # convert the lists to numpy arrays
+        input_ids = np.array(input_ids)
+        input_masks = np.array(input_masks)
+        input_type_ids = np.array(input_type_ids)
+        logger.info("process ok!")
+        return [input_ids, input_masks, input_type_ids]
+
+    def build_model_bilstm_layers(self):
+        if args.use_lstm:
+            if args.use_cudnn_cell:
+                layer_cell = CuDNNLSTM
+            else:
+                layer_cell = LSTM
+        else:
+            if args.use_cudnn_cell:
+                layer_cell = CuDNNGRU
+            else:
+                layer_cell = GRU
+        # bert embedding
+        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
+
+        # Bi-LSTM
+        x = Bidirectional(layer_cell(units=args.units,
+                                     return_sequences=args.return_sequences,
+                                     ))(bert_output)
+        # output layers
+        x = TimeDistributed(Dropout(self.keep_prob))(x)
+        dense_layer = Dense(args.max_seq_len, activation=args.activation)(x)
+        crf = CRF(args.label, sparse_target=False, learn_mode="join", test_mode='viterbi')
+        output_layers = crf(dense_layer)
+        self.model = Model(bert_inputs, output_layers)
+        self.model.summary(132)
+
+    def compile_model(self):
+        self.model.compile(
+            optimizer=Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=args.epsilon, decay=0.0),
+            loss=crf_loss if args.use_crf else sparse_categorical_crossentropy,
+            metrics=[crf_accuracy] if args.metrics == 'crf_loss' else args.metrics)
+        # loss=CRF.loss_function if args.use_crf else categorical_crossentropy,
+        # metrics=[CRF.accuracy] if args.metrics == 'crf_loss' else args.metrics)
+        # loss=crf.loss if args.use_crf else categorical_crossentropy,
+        # metrics=[crf.accuracy] if args.metrics == 'crf_loss' else args.metrics)
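+
+    # Editor's note, a shape walk-through for the model assembled above,
+    # assuming BERT-base and the defaults in args.py (all 13 layers concatenated):
+    #   bert_output:            (batch, 50, 768 * 13 = 9984)
+    #   Bi-LSTM(units=256):     (batch, 50, 512)
+    #   Dense(max_seq_len=50):  (batch, 50, 50)
+    #   CRF(label=7):           (batch, 50, 7)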
logger.info("BertBiLstmModel load_model start!") + self.model.load_weights(args.path_save_model) + # logger.info("BertBiLstmModel load_model end+!") + print("BertNerBiLstmModel load_model end+!") + + def predict(self, sen): + input_ids, input_masks, input_type_ids = self.process_single([sen]) + probs = self.model.predict([input_ids, input_masks], batch_size=1) + probs_first = probs[0] + preds = [] + for prob_one in probs_first: + prob_max = np.argmax(prob_one) + preds.append(prob_max) + return preds + + def predict_list(self, questions): + label_preds = [] + for questions_pair in questions: + input_ids, input_masks, input_type_ids = self.process_single([questions_pair]) + label_pred = self.model.predict([input_ids, input_masks], batch_size=1) + label_preds.append(label_pred) + return label_preds + + +def get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(file_path): + """ + 读取人民日报语料,其实就是一个txt阅读器 + :param file_path: str, text + :return: list, list + """ + _x_, _y_ = [], [] + with open(file_path, "r", encoding="utf-8") as fr: + lines = fr.read().splitlines() + x, y = [], [] + for line_one in lines: + rows = line_one.split(" ") + if len(rows) == 1: + _x_.append(x), _y_.append(y) + x, y = [], [] + else: + x.append(rows[0]), y.append(rows[1]) + return _x_, _y_ + + +def label_tagging(data_x_s, tag_label2index, len_max=32): + """ + 根据类别字典dict、语料y和最大文本长度l,padding和to_categorical + :param data_x_s: list + :param tag_label2index:dict + :param len_max: int + :return: list + """ + tag_labels = [] + for data_x in data_x_s: + if len(data_x) <= len_max-2: + tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x] + [tag_label2index['O'] for i in range(len_max - len(data_x) - 1)]) + else: + tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x[:len_max-1]] + [tag_label2index['O']]) + + tag_labels_pad = pad_sequences(sequences=tag_labels, maxlen=len_max, dtype='int32', + padding='post', truncating='post', value=tag_label2index['O']) + one_hot_y = to_categorical(tag_labels_pad, num_classes=len(tag_label2index)) + + label_num = len(set(["".join(str(i)) for i in tag_labels])) + # tag_labels_pad_to = to_categorical(y=tag_labels_pad.tolist(), num_classes=label_num) + return one_hot_y, label_num + + +def label_tagging_predict(y_predicts, tag_i2l): + y_preds = [] + count_y_predict = y_predicts[0].shape[1] + for y_predict in y_predicts: + temp = [] + for i in range(count_y_predict): + y_predict_list = y_predict[0][i].tolist() + y_predict_max = y_predict_list.index(max(y_predict_list)) + pred_label = tag_i2l[y_predict_max] + temp.append(pred_label) + y_preds.append(temp) + return y_preds + + +def create_label_index_dict(data_x_s): + """ + 构建类别和index标签,一一对应等 + :param data_x_s: list, labels of train data + :return: list, list + """ + # 首先构建index2label, 或者label2index + tag_label2index = {} + tag_index2label = {} + data_x_s_one = [] + for d in data_x_s: + data_x_s_one = data_x_s_one + d + label_data_x_s = list(set(data_x_s_one)) + for i in range(len(label_data_x_s)): + tag_label2index[label_data_x_s[i]] = i + tag_index2label[i] = label_data_x_s[i] + return tag_label2index, tag_index2label + + +def process_ner_y(y_data, length_max): + """ + 根据训练语料y生成喂入模型的input_y + :param y_data: list + :param length_max: int + :return: list, dict, dict + """ + # 保存类别字典 + import os + if not os.path.exists(args.path_tag_li): + tag_l2i, tag_i2l = create_label_index_dict(y_data) + with open(args.path_tag_li, 'wb') as f: + pickle.dump((tag_l2i, tag_i2l), f) + else: + with 
+
+def label_tagging_predict(y_predicts, tag_i2l):
+    y_preds = []
+    count_y_predict = y_predicts[0].shape[1]
+    for y_predict in y_predicts:
+        temp = []
+        for i in range(count_y_predict):
+            y_predict_list = y_predict[0][i].tolist()
+            y_predict_max = y_predict_list.index(max(y_predict_list))
+            pred_label = tag_i2l[y_predict_max]
+            temp.append(pred_label)
+        y_preds.append(temp)
+    return y_preds
+
+
+def create_label_index_dict(data_x_s):
+    """
+    build the label-to-index and index-to-label dicts (one-to-one mapping)
+    :param data_x_s: list, labels of train data
+    :return: list, list
+    """
+    # first build index2label and label2index
+    tag_label2index = {}
+    tag_index2label = {}
+    data_x_s_one = []
+    for d in data_x_s:
+        data_x_s_one = data_x_s_one + d
+    label_data_x_s = list(set(data_x_s_one))
+    for i in range(len(label_data_x_s)):
+        tag_label2index[label_data_x_s[i]] = i
+        tag_index2label[i] = label_data_x_s[i]
+    return tag_label2index, tag_index2label
+
+
+def process_ner_y(y_data, length_max):
+    """
+    build the model's input_y from the training labels y
+    :param y_data: list
+    :param length_max: int
+    :return: list, dict, dict
+    """
+    # save (or load) the label dicts
+    import os
+    if not os.path.exists(args.path_tag_li):
+        tag_l2i, tag_i2l = create_label_index_dict(y_data)
+        with open(args.path_tag_li, 'wb') as f:
+            pickle.dump((tag_l2i, tag_i2l), f)
+    else:
+        with open(args.path_tag_li, 'rb') as f:
+            tag_l2i, tag_i2l = pickle.load(f)
+    tagging_index, label_num = y_data, length_max
+    try:
+        # tagging
+        tagging_index, label_num = label_tagging(y_data, tag_l2i, length_max)
+    except Exception:
+        # keep the fall-back values when y_data is empty (see predict() below)
+        pass
+
+    return tagging_index, label_num, tag_l2i, tag_i2l
+
+
+def train():
+    # 1. train
+    bert_model = BertNerBiLstmModel()
+    # bert_model.compile_model()
+    print("process corpus start!")
+    # read the corpus
+    x_train, y_train = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_train)
+    x_dev, y_dev = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_dev)
+    # index and pad the questions and labels
+    x_train = bert_model.process_single(x_train)
+    x_dev = bert_model.process_single(x_dev)
+    y_train_tagging_index, label_num, tag_l2i, tag_i2l = process_ner_y(y_train, args.max_seq_len)
+    y_dev_tagging_index, _, _, _ = process_ner_y(y_dev, args.max_seq_len)
+    # args.label = label_num
+    print(label_num)
+    print("process corpus end!")
+    # keep only the first two arrays, matching the two inputs of the bert graph
+    x_train_2 = x_train[0:2]
+    x_dev_2 = x_dev[0:2]
+    print(x_train_2.__sizeof__())
+    print(x_dev_2.__sizeof__())
+    y_train_2 = y_train_tagging_index
+    y_dev_2 = y_dev_tagging_index
+
+    bert_model.fit(x_train_2, y_train_2, x_dev_2, y_dev_2)
+
+
+def predict():
+    # 3. predict
+    _, _, tag_l2i, tag_i2l = process_ner_y([], [])
+    bert_model = BertNerBiLstmModel()
+    bert_model.load_model()
+    pred = bert_model.predict(sen='欧美和台湾经济怎么样')
+    tag_labels = []
+    for pre in pred:
+        tag_labels.append(tag_i2l[pre])
+    print(tag_labels)
+    while True:
+        print("sen: ")
+        sen_1 = input()
+        pred = bert_model.predict(sen=sen_1)
+        tag_labels = []
+        for pre in pred:
+            tag_labels.append(tag_i2l[pre])
+        print(tag_labels)
+
+
+if __name__ == "__main__":
+    train()
+    # predict()
+
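+
+# Editor's note, a hypothetical run: train first, then switch __main__ to
+# predict() for an interactive tagging loop (the model and tag dicts are
+# persisted under args.path_save_model / args.path_tag_li):
+#   python keras_bert_ner_bi_lstm.py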
diff --git a/Ner/bert/layer_crf_bojone.py b/Ner/bert/layer_crf_bojone.py
new file mode 100644
index 0000000..d38b4ac
--- /dev/null
+++ b/Ner/bert/layer_crf_bojone.py
@@ -0,0 +1,78 @@
+# -*- coding: UTF-8 -*-
+# !/usr/bin/python
+# @time :2019/5/26 9:29
+# @author :Mo
+# @function :
+
+from __future__ import absolute_import
+from __future__ import division
+
+from keras.layers import Layer
+import keras.backend as K
+
+
+class CRF(Layer):
+    """
+    codes from: https://github.com/bojone/crf/blob/master/crf_keras.py
+    a pure-Keras implementation of a CRF layer.
+    The CRF layer is essentially a loss-computation layer with trainable
+    parameters, so it is only used for training the model;
+    prediction requires building a separate model.
+    """
+
+    def __init__(self, ignore_last_label=False, **kwargs):
+        """ignore_last_label: whether to ignore the last label, which then acts as a mask
+        """
+        self.ignore_last_label = 1 if ignore_last_label else 0
+        super(CRF, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        self.num_labels = input_shape[-1] - self.ignore_last_label
+        self.trans = self.add_weight(name='crf_trans',
+                                     shape=(self.num_labels, self.num_labels),
+                                     initializer='glorot_uniform',
+                                     trainable=True)
+
+    def log_norm_step(self, inputs, states):
+        """recursively compute the normalization constant Z.
+        key points: 1. recursion; 2. logsumexp to avoid overflow.
+        trick: align the tensors via expand_dims.
+        """
+        states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
+        trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
+        output = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
+        return output + inputs, [output + inputs]
+
+    def path_score(self, inputs, labels):
+        """compute the relative (unnormalized) probability of the target path.
+        key points: per-label scores plus transition scores.
+        trick: extract the target path's scores by dotting "predictions" with "targets".
+        """
+        point_score = K.sum(K.sum(inputs * labels, 2), 1, keepdims=True)  # per-label scores
+        labels1 = K.expand_dims(labels[:, :-1], 3)
+        labels2 = K.expand_dims(labels[:, 1:], 2)
+        labels = labels1 * labels2  # two staggered label tensors, used to select the target transition scores from the transition matrix
+        trans = K.expand_dims(K.expand_dims(self.trans, 0), 0)
+        trans_score = K.sum(K.sum(trans * labels, [2, 3]), 1, keepdims=True)
+        return point_score + trans_score  # sum of the two score parts
+
+    def call(self, inputs):  # the CRF itself does not change its input; it only computes a loss
+        return inputs
+
+    def loss(self, y_true, y_pred):  # y_true must be one-hot
+        mask = 1 - y_true[:, 1:, -1] if self.ignore_last_label else None
+        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
+        init_states = [y_pred[:, 0]]  # initial states
+        log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states, mask=mask)  # compute the Z vector (log scale)
+        log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # compute Z (log scale)
+        path_score = self.path_score(y_pred, y_true)  # compute the numerator (log scale)
+        return log_norm - path_score  # i.e. log(numerator / denominator)
+
+    def accuracy(self, y_true, y_pred):  # per-frame accuracy shown during training, with masked frames excluded
+        mask = 1 - y_true[:, :, -1] if self.ignore_last_label else None
+        y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
+        isequal = K.equal(K.argmax(y_true, 2), K.argmax(y_pred, 2))
+        isequal = K.cast(isequal, 'float32')
+        if mask is None:
+            return K.mean(isequal)
+        else:
+            return K.sum(isequal * mask) / K.sum(mask)
diff --git a/Ner/bert/models/bilstm/useless.txt b/Ner/bert/models/bilstm/useless.txt
new file mode 100644
index 0000000..eb718e8
--- /dev/null
+++ b/Ner/bert/models/bilstm/useless.txt
@@ -0,0 +1 @@
+useless
\ No newline at end of file