#!/usr/bin/python
# -*- coding: UTF-8 -*-
# @time :2019/5/10 18:05
# @author :Mo
# @function :NER (sequence tagging) with bert embedding and bi-lstm + crf
from __future__ import division, absolute_import
import logging as logger
import numpy as np
import pickle
import codecs
# bert embedding
from Ner.bert.keras_bert_layer import CRF, crf_loss, crf_accuracy
from Ner.bert.keras_bert_embedding import KerasBertEmbedding
from Ner.bert.keras_bert_layer import NonMaskingLayer
# keras layers, callbacks and optimizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.layers import Bidirectional, CuDNNGRU, CuDNNLSTM
from keras.layers import Dense, Dropout
from keras.layers import GRU, LSTM, TimeDistributed
from keras.objectives import sparse_categorical_crossentropy
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
# bert sequence tagging
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras_bert import Tokenizer
# corpus path
from Ner.bert import args
from conf.path_config import path_ner_people_train, path_ner_people_dev, path_ner_people_test
from conf.feature_config import vocab_file


class BertNerBiLstmModel():
    def __init__(self):
        # logger.info("BertBiLstmModel init start!")
        print("BertNerBiLstmModel init start!")
        self.dict_path, self.max_seq_len, self.keep_prob, self.is_training = \
            vocab_file, args.max_seq_len, args.keep_prob, args.is_training
        # read the vocab and build the tokenizer
        self.token_dict = {}
        with codecs.open(self.dict_path, 'r', 'utf8') as reader:
            for line in reader:
                token = line.strip()
                self.token_dict[token] = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict)
        # choose one model to build: bi-lstm single, bi-lstm 3-layers or bi-lstm-attention
        self.build_model_bilstm_layers()
        self.compile_model()
        # self.build_model_bilstm_single()
        # logger.info("BertBiLstmModel init end!")
        print("BertNerBiLstmModel init end!")

    def process_single(self, texts):
        # text preprocessing: takes a list of texts, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for text in texts:
            if isinstance(text, list):
                text = "".join(text)
            logger.info(text)
            tokens_text = self.tokenizer.tokenize(text)
            logger.info('Tokens: %s', tokens_text)
            input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]
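
    # Shape sketch (illustrative, not executed here): with max_seq_len = 32,
    #   ids, masks, type_ids = model.process_single(["南京市长江大桥"])
    # returns three (1, 32) arrays: ids holds [CLS] x1 ... xn [SEP] then 0-padding,
    # masks is 1 for real tokens and 0 for padding, and type_ids is all zeros
    # for single-sentence input.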

    def process_pair(self, textss):
        # text preprocessing for sentence pairs: takes a list of pairs, returns ids / masks / type-ids
        input_ids = []
        input_masks = []
        input_type_ids = []
        for texts in textss:
            tokens_text = self.tokenizer.tokenize(texts[0])
            logger.info('Tokens1: %s', tokens_text)
            tokens_text2 = self.tokenizer.tokenize(texts[1])
            logger.info('Tokens2: %s', tokens_text2)
            input_id, input_type_id = self.tokenizer.encode(first=texts[0], second=texts[1], max_len=self.max_seq_len)
            input_mask = [0 if ids == 0 else 1 for ids in input_id]
            input_ids.append(input_id)
            input_type_ids.append(input_type_id)
            input_masks.append(input_mask)
        # convert lists to numpy arrays
        input_ids = np.array(input_ids)
        input_masks = np.array(input_masks)
        input_type_ids = np.array(input_type_ids)
        logger.info("process ok!")
        return [input_ids, input_masks, input_type_ids]

    def build_model_bilstm_layers(self):
        # pick the recurrent cell: LSTM or GRU, CuDNN variants if requested
        if args.use_lstm:
            if args.use_cudnn_cell:
                layer_cell = CuDNNLSTM
            else:
                layer_cell = LSTM
        else:
            if args.use_cudnn_cell:
                layer_cell = CuDNNGRU
            else:
                layer_cell = GRU
        # bert embedding
        bert_inputs, bert_output = KerasBertEmbedding().bert_encode()
        # Bi-LSTM over the bert token embeddings
        x = Bidirectional(layer_cell(units=args.units,
                                     return_sequences=args.return_sequences,
                                     ))(bert_output)
        # final layers: dropout (note: Keras Dropout takes the drop fraction,
        # which is fed keep_prob here), dense projection and CRF tagging
        x = TimeDistributed(Dropout(self.keep_prob))(x)
        dense_layer = Dense(args.max_seq_len, activation=args.activation)(x)
        crf = CRF(args.label, sparse_target=False, learn_mode="join", test_mode='viterbi')
        output_layers = crf(dense_layer)
        self.model = Model(bert_inputs, output_layers)
        self.model.summary(132)
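
    # Tensor-shape sketch (assuming a BERT-base encoder, hidden size 768):
    #   bert_output                  -> (batch, max_seq_len, 768)
    #   Bidirectional(units=u)       -> (batch, max_seq_len, 2*u)
    #   Dense(max_seq_len)           -> (batch, max_seq_len, max_seq_len)
    #   CRF(args.label)              -> (batch, max_seq_len, args.label)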

    def compile_model(self):
        self.model.compile(
            optimizer=Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=args.epsilon, decay=0.0),
            loss=crf_loss if args.use_crf else sparse_categorical_crossentropy,
            metrics=[crf_accuracy] if args.metrics == 'crf_loss' else args.metrics)
        # loss=CRF.loss_function if args.use_crf else categorical_crossentropy,
        # metrics=[CRF.accuracy] if args.metrics == 'crf_loss' else args.metrics)
        # loss=crf.loss if args.use_crf else categorical_crossentropy,
        # metrics=[crf.accuracy] if args.metrics == 'crf_loss' else args.metrics)

    def callback(self):
        cb = [ModelCheckpoint(monitor='val_loss', mode='min', filepath=args.path_save_model, verbose=1,
                              save_best_only=True, save_weights_only=False),
              ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.2, patience=2, verbose=0,
                                epsilon=1e-6, cooldown=4, min_lr=1e-8),
              EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=2)
              ]
        return cb

    def fit(self, x_train, y_train, x_dev, y_dev):
        self.model.fit(x_train, y_train, batch_size=args.batch_size,
                       epochs=args.epochs, validation_data=(x_dev, y_dev),
                       shuffle=True,
                       callbacks=self.callback())
        self.model.save(args.path_save_model)

    def load_model(self):
        print("BertNerBiLstmModel load_model start!")
        # logger.info("BertBiLstmModel load_model start!")
        self.model.load_weights(args.path_save_model)
        # logger.info("BertBiLstmModel load_model end!")
        print("BertNerBiLstmModel load_model end!")

    def predict(self, sen):
        input_ids, input_masks, input_type_ids = self.process_single([sen])
        probs = self.model.predict([input_ids, input_masks], batch_size=1)
        probs_first = probs[0]
        preds = []
        for prob_one in probs_first:
            # argmax over the tag scores at each position
            prob_max = np.argmax(prob_one)
            preds.append(prob_max)
        return preds
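
    # Usage sketch: predict returns one tag index per position (max_seq_len of
    # them, including the [CLS]/[SEP]/padding slots); map them back to tag
    # strings with the tag_i2l dict built by process_ner_y below, e.g.
    #   tags = [tag_i2l[p] for p in model.predict(sen='欧美和台湾经济怎么样')]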

    def predict_list(self, questions):
        label_preds = []
        for questions_pair in questions:
            input_ids, input_masks, input_type_ids = self.process_single([questions_pair])
            label_pred = self.model.predict([input_ids, input_masks], batch_size=1)
            label_preds.append(label_pred)
        return label_preds


def get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(file_path):
    """
    Read the People's Daily NER corpus; essentially just a txt reader.
    :param file_path: str, text
    :return: list, list
    """
    _x_, _y_ = [], []
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.read().splitlines()
        x, y = [], []
        for line_one in lines:
            rows = line_one.split(" ")
            if len(rows) == 1:
                # blank line marks the end of a sentence
                _x_.append(x)
                _y_.append(y)
                x, y = [], []
            else:
                x.append(rows[0])
                y.append(rows[1])
    return _x_, _y_
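
# Expected corpus layout (a sketch of the usual People's Daily BIO format:
# one "char tag" pair per line, blank line between sentences):
#   中 B-LOC
#   国 I-LOC
#   很 O
#   大 O
#
# which the reader above turns into x = [['中', '国', '很', '大'], ...] and
# y = [['B-LOC', 'I-LOC', 'O', 'O'], ...].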


def label_tagging(data_x_s, tag_label2index, len_max=32):
    """
    Index, pad and one-hot the label sequences, given the label dict,
    the corpus labels y and the max text length len_max.
    :param data_x_s: list
    :param tag_label2index: dict
    :param len_max: int
    :return: list
    """
    tag_labels = []
    for data_x in data_x_s:
        if len(data_x) <= len_max - 2:
            # leading 'O' for [CLS], then the tags, then 'O' padding
            tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x]
                              + [tag_label2index['O'] for i in range(len_max - len(data_x) - 1)])
        else:
            # truncate long sentences; pad_sequences below trims to len_max
            tag_labels.append([tag_label2index['O']] + [tag_label2index[i] for i in data_x[:len_max - 1]]
                              + [tag_label2index['O']])
    tag_labels_pad = pad_sequences(sequences=tag_labels, maxlen=len_max, dtype='int32',
                                   padding='post', truncating='post', value=tag_label2index['O'])
    one_hot_y = to_categorical(tag_labels_pad, num_classes=len(tag_label2index))
    # NOTE: this counts distinct tag *sequences*, not distinct labels
    label_num = len(set(["".join(str(i)) for i in tag_labels]))
    # tag_labels_pad_to = to_categorical(y=tag_labels_pad.tolist(), num_classes=label_num)
    return one_hot_y, label_num
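
# Worked example for label_tagging (a small runnable sketch; the tag dict here
# is illustrative):
def _demo_label_tagging():
    tag_l2i = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
    one_hot_y, label_num = label_tagging([['B-LOC', 'I-LOC', 'O']], tag_l2i, len_max=6)
    # indexed sequence is [0, 1, 2, 0, 0, 0]: leading 'O' for [CLS], the three
    # tags, then 'O' padding to len_max
    print(one_hot_y.shape)  # (1, 6, 3)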


def label_tagging_predict(y_predicts, tag_i2l):
    # map per-position probability vectors back to tag strings via tag_i2l
    y_preds = []
    count_y_predict = y_predicts[0].shape[1]
    for y_predict in y_predicts:
        temp = []
        for i in range(count_y_predict):
            y_predict_list = y_predict[0][i].tolist()
            y_predict_max = y_predict_list.index(max(y_predict_list))
            pred_label = tag_i2l[y_predict_max]
            temp.append(pred_label)
        y_preds.append(temp)
    return y_preds


def create_label_index_dict(data_x_s):
    """
    Build one-to-one mappings between labels and indices.
    :param data_x_s: list, labels of train data
    :return: dict, dict
    """
    # build label2index and index2label
    tag_label2index = {}
    tag_index2label = {}
    data_x_s_one = []
    for d in data_x_s:
        data_x_s_one = data_x_s_one + d
    label_data_x_s = list(set(data_x_s_one))
    for i in range(len(label_data_x_s)):
        tag_label2index[label_data_x_s[i]] = i
        tag_index2label[i] = label_data_x_s[i]
    return tag_label2index, tag_index2label
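
# Example (a sketch): set() ordering is not stable across runs, which is why
# process_ner_y below pickles the dicts on first use and reloads them afterwards.
def _demo_create_label_index_dict():
    tag_l2i, tag_i2l = create_label_index_dict([['B-PER', 'I-PER', 'O'], ['O']])
    print(tag_l2i)  # e.g. {'O': 0, 'B-PER': 1, 'I-PER': 2} -- order may vary
    print(tag_i2l)  # the inverse mapping, index -> label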


def process_ner_y(y_data, length_max):
    """
    Generate the model input_y from the training labels y.
    :param y_data: list
    :param length_max: int
    :return: list, dict, dict
    """
    import os
    # persist the label dicts so indices stay consistent between runs
    if not os.path.exists(args.path_tag_li):
        tag_l2i, tag_i2l = create_label_index_dict(y_data)
        with open(args.path_tag_li, 'wb') as f:
            pickle.dump((tag_l2i, tag_i2l), f)
    else:
        with open(args.path_tag_li, 'rb') as f:
            tag_l2i, tag_i2l = pickle.load(f)
    tagging_index, label_num = y_data, length_max
    try:
        # tagging
        tagging_index, label_num = label_tagging(y_data, tag_l2i, length_max)
    except Exception:
        # e.g. y_data is empty when only the label dicts are needed (see predict)
        pass
    return tagging_index, label_num, tag_l2i, tag_i2l


def train():
    # 1. train
    bert_model = BertNerBiLstmModel()
    # bert_model.compile_model()
    print("process corpus start!")
    # read corpus
    x_train, y_train = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_train)
    x_dev, y_dev = get_sequence_tagging_data_from_chinese_people_daily_ner_corpus(path_ner_people_dev)
    # question and label: index and padding
    x_train = bert_model.process_single(x_train)
    x_dev = bert_model.process_single(x_dev)
    y_train_tagging_index, label_num, tag_l2i, tag_i2l = process_ner_y(y_train, args.max_seq_len)
    y_dev_tagging_index, _, _, _ = process_ner_y(y_dev, args.max_seq_len)
    # args.label = label_num
    print(label_num)
    print("process corpus end!")
    # keep only [input_ids, input_masks]; the model takes two inputs
    x_train_2 = x_train[0:2]
    x_dev_2 = x_dev[0:2]
    print(x_train_2.__sizeof__())
    print(x_dev_2.__sizeof__())
    y_train_2 = y_train_tagging_index
    y_dev_2 = y_dev_tagging_index
    bert_model.fit(x_train_2, y_train_2, x_dev_2, y_dev_2)


def predict():
    # 3. predict
    # only the label dicts are needed here, so pass an empty y_data
    _, _, tag_l2i, tag_i2l = process_ner_y([], args.max_seq_len)
    bert_model = BertNerBiLstmModel()
    bert_model.load_model()
    pred = bert_model.predict(sen='欧美和台湾经济怎么样')
    tag_labels = []
    for pre in pred:
        tag_labels.append(tag_i2l[pre])
    print(tag_labels)
    while True:
        print("sen: ")
        sen_1 = input()
        pred = bert_model.predict(sen=sen_1)
        tag_labels = []
        for pre in pred:
            tag_labels.append(tag_i2l[pre])
        print(tag_labels)


if __name__ == "__main__":
    train()
    # predict()
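
# Usage sketch: run this module once with train() enabled to fit and save the
# model, then switch the call above to predict() for interactive tagging.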