fix text_preprocess and albert

yongzhuo 2020-08-12 16:18:18 +08:00
parent e5458ec86f
commit d1cb728d23
69 changed files with 1087 additions and 344 deletions

View File

@ -141,5 +141,16 @@ train(graph='TextCNN', # required: algorithm name, one of "ALBERT","BERT","XLNET","FASTT
hyper_parameters=None) # optional: json-format hyper-parameters; default embedding is 'char','random'
```
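Note that in the training scripts of this repository the default hyper-parameters are only built when `hyper_parameters` is `None`, so a custom dict should follow the same nested structure. A minimal sketch (key names taken from the ALBERT training script in this commit, values illustrative; whether the api-level `train` merges partial dicts is not shown here):
```
from keras_textclassification.text_classification_api import train

my_hyper_parameters = {
    'len_max': 50,             # maximum sentence length
    'embed_size': 768,         # embedding dimension
    'embedding_type': 'albert',
    'model': {'label': 17,     # number of classes
              'batch_size': 32,
              'epochs': 20,
              'lr': 5e-5},
}
# train(graph='ALBERT', ..., hyper_parameters=my_hyper_parameters)  # other arguments as in the call above
```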
# Reference
To cite this work, please refer to this GitHub project, for example with BibTeX:
```
@misc{Keras-TextClassification,
howpublished = {\url{https://github.com/yongzhuo/Keras-TextClassification}},
title = {Keras-TextClassification},
author = {Yongzhuo Mo},
publisher = {GitHub},
year = {2019}
}
```
*Hope this helps!

View File

@ -5,5 +5,10 @@
# @function :
from keras_textclassification.text_classification_api import train
# from keras_textclassification.text_classification_api import train
#
# res = "假道士敷衍超渡,鬼魂一家感觉受到了屈辱,现出真身捉弄他"
# mention = "道士"
# offset = 1
# print(res[1])
# print(res[1+1])
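The commented scratch above inspects a character-level mention offset; a small runnable illustration of what those prints check:
```
res = "假道士敷衍超渡,鬼魂一家感觉受到了屈辱,现出真身捉弄他"
mention = "道士"
offset = 1
print(res[offset])      # '道', first character of the mention
print(res[offset + 1])  # '士', second character of the mention
assert res[offset:offset + len(mention)] == mention
```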

View File

@ -83,7 +83,9 @@ class BaseEmbedding:
self.token2idx = {}
self.idx2token = {}
def sentence2idx(self, text, second_text=""):
def sentence2idx(self, text, second_text=None):
if second_text:
second_text = "[SEP]" + str(second_text).upper()
# text = extract_chinese(str(text).upper())
text = str(text).upper()
@ -296,14 +298,41 @@ class BertEmbedding(BaseEmbedding):
self.vocab_size = len(self.token_dict)
self.tokenizer = keras_bert.Tokenizer(self.token_dict)
def sentence2idx(self, text, second_text=""):
# text = extract_chinese(str(text).upper())
def build_keras4bert(self):
import bert4keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer,load_vocab
import os
self.embedding_type = 'bert'
config_path = os.path.join(self.corpus_path, 'bert_config.json')
checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
dict_path = os.path.join(self.corpus_path, 'vocab.txt')
self.model = bert4keras.models.build_transformer_model(config_path=config_path,
checkpoint_path=checkpoint_path)
# load and simplify the vocabulary, then build the tokenizer
self.token_dict, keep_tokens = load_vocab(
dict_path=dict_path,
simplified=True,
startwith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
self.vocab_size = len(self.token_dict)
self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
def sentence2idx(self, text, second_text=None):
text = extract_chinese(str(text).upper())
text = str(text).upper()
input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max)
# input_mask = [0 if ids == 0 else 1 for ids in input_id]
# return input_id, input_type_id, input_mask
return [input_id, input_type_id]
# input_id, input_type_id = self.tokenizer.encode(first_text=text,
# second_text=second_text,
# max_length=self.len_max,
# first_length=self.len_max)
#
# input_mask = [0 if ids == 0 else 1 for ids in input_id]
# return [input_id, input_type_id, input_mask]
class XlnetEmbedding(BaseEmbedding):
def __init__(self, hyper_parameters):
@ -408,7 +437,7 @@ class XlnetEmbedding(BaseEmbedding):
self.embedding_size = self.model.output_shape[-1]
self.vocab_size = len(self.tokenizer.sp)
def sentence2idx(self, text, second_text=""):
def sentence2idx(self, text, second_text=None):
# text = extract_chinese(str(text).upper())
text = str(text).upper()
tokens = self.tokenizer.encode(text)
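A note on `build_keras4bert` above: `load_vocab(..., simplified=True)` returns `keep_tokens` precisely so the embedding matrix can be trimmed to the reduced vocabulary, and in typical bert4keras usage it is passed back to `build_transformer_model`. A minimal sketch (paths are placeholders; the `keep_tokens` argument is assumed to exist in the installed bert4keras version):
```
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab

token_dict, keep_tokens = load_vocab(dict_path="vocab.txt", simplified=True)
model = build_transformer_model(config_path="bert_config.json",
                                checkpoint_path="bert_model.ckpt",
                                keep_tokens=keep_tokens)  # trim embeddings to the kept tokens
tokenizer = Tokenizer(token_dict, do_lower_case=True)
```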

View File

@ -6,7 +6,7 @@
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
from keras_textclassification.data_preprocess.generator_preprocess import PreprocessGenerator
from keras_textclassification.data_preprocess.generator_preprocess import PreprocessGenerator, PreprocessSimGenerator
from keras_textclassification.data_preprocess.text_preprocess import save_json
from keras_textclassification.keras_layers.keras_lookahead import Lookahead
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
@ -57,8 +57,8 @@ class graph:
# keras/tensorflow GPU usage settings, etc.
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
# config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
self.create_model(hyper_parameters)
@ -101,7 +101,7 @@ class graph:
cb_em = [ TensorBoard(log_dir=os.path.join(self.path_model_dir, "logs"), batch_size=self.batch_size, update_freq='batch'),
EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=self.patience),
ModelCheckpoint(monitor='val_loss', mode='min', filepath=self.model_path, verbose=1,
save_best_only=True, save_weights_only=False),]
save_best_only=True, save_weights_only=True),]
return cb_em
def create_compile(self):
@ -109,9 +109,10 @@ class graph:
Build the optimizer, loss function and evaluation metrics
:return:
"""
if self.optimizer_name.upper() == "ADAM":
self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
loss=self.loss,
loss= self.loss,
metrics=[self.metrics]) # Any optimize
elif self.optimizer_name.upper() == "RADAM":
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
@ -119,7 +120,7 @@ class graph:
metrics=[self.metrics]) # Any optimize
else:
self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
loss=self.loss,
loss= self.loss,
metrics=[self.metrics]) # Any optimize
lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead
lookahead.inject(self.model) # add into model
@ -139,6 +140,9 @@ class graph:
self.hyper_parameters['model']['dropout'] = 0.0
save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)
# if self.is_training and os.path.exists(self.model_path):
# print("load_weights")
# self.model.load_weights(self.model_path)
# train the model
self.model.fit(x_train, y_train, batch_size=self.batch_size,
epochs=self.epochs, validation_data=(x_dev, y_dev),
@ -164,17 +168,19 @@ class graph:
save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)
pg = PreprocessGenerator()
pg = PreprocessGenerator(self.path_model_dir)
_, len_train = pg.preprocess_get_label_set(self.hyper_parameters['data']['train_data'])
data_fit_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
batch_size=self.batch_size,
path=self.hyper_parameters['data']['train_data'],
epcoh=self.epochs,
embed=embed,
rate=rate)
_, len_val = pg.preprocess_get_label_set(self.hyper_parameters['data']['val_data'])
data_dev_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
batch_size=self.batch_size,
path=self.hyper_parameters['data']['val_data'],
epcoh=self.epochs,
embed=embed,
rate=rate)
steps_per_epoch = len_train // self.batch_size + 1
@ -190,6 +196,54 @@ class graph:
if self.trainable:
self.word_embedding.model.save(self.path_fineture)
def fit_generator_sim(self, embed, rate=1):
"""
:param data_fit_generator: yield, training-data generator
:param data_dev_generator: yield, validation-data generator
:param steps_per_epoch: int, steps per training epoch
:param validation_steps: int, steps per validation epoch
:return:
"""
# save hyper-parameters
self.hyper_parameters['model']['is_training'] = False # set these to False for prediction
self.hyper_parameters['model']['trainable'] = False
self.hyper_parameters['model']['dropout'] = 0.0
save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)
pg = PreprocessSimGenerator(self.hyper_parameters['model']['path_model_dir'])
_, len_train = pg.preprocess_get_label_set(self.hyper_parameters['data']['train_data'])
data_fit_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
batch_size=self.batch_size,
path=self.hyper_parameters['data']['train_data'],
embed=embed,
epcoh=self.epochs,
rate=rate)
_, len_val = pg.preprocess_get_label_set(self.hyper_parameters['data']['val_data'])
data_dev_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
batch_size=self.batch_size,
path=self.hyper_parameters['data']['val_data'],
embed=embed,
epcoh=self.epochs,
rate=rate)
steps_per_epoch = len_train // self.batch_size + 1
validation_steps = len_val // self.batch_size + 1
# self.model.load_weights(self.model_path)
# train the model
self.model.fit_generator(generator=data_fit_generator,
validation_data=data_dev_generator,
callbacks=self.callback(),
epochs=self.epochs,
steps_per_epoch=32,
validation_steps=6)
# save the embedding if it is trainable (dynamic)
if self.trainable:
self.word_embedding.model.save(self.path_fineture)
# 1600000/6=266666
# 300000/6=50000
# 36000/6000
def load_model(self):
"""
Load the model
@ -221,3 +275,5 @@ class graph:
else:
raise RuntimeError("your input sen is wrong, it must be type of list or np.array")
return self.model.predict(sen)
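Because `ModelCheckpoint` above now uses `save_weights_only=True`, the file at `model_path` holds weights only and cannot be restored with `keras.models.load_model` alone; a hedged sketch of the reload path (method and attribute names follow this class, the internals of `load_model` are not shown in this hunk):
```
graph = Graph(hyper_parameters)             # rebuild the architecture, as in the prediction scripts
graph.model.load_weights(graph.model_path)  # restore the weights-only checkpoint written by ModelCheckpoint
```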

View File

@ -5,12 +5,16 @@
# @function:
from keras_textclassification.data_preprocess.text_preprocess import load_json, save_json
from keras_textclassification.data_preprocess.text_preprocess import load_json, save_json, txt_read
from keras_textclassification.conf.path_config import path_model_dir
path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
path_ccks_2020_el_kg_dev, path_ccks_2020_el_kg_tet, path_ccks_2020_el_kg_train,\
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
path_root
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
@ -18,13 +22,15 @@ class PreprocessGenerator:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self):
def __init__(self, path_model_dir):
self.l2i_i2l = None
if os.path.exists(path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
@ -35,7 +41,7 @@ class PreprocessGenerator:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
@ -63,7 +69,7 @@ class PreprocessGenerator:
def preprocess_label_ques_to_idx(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
label_set, len_all = self.preprocess_get_label_set(path)
# build the label<->index dicts; skip if label2index already exists (used when processing the dev set)
if not os.path.exists(path_fast_text_model_l2i_i2l):
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
count = 0
label2index = {}
index2label = {}
@ -75,9 +81,9 @@ class PreprocessGenerator:
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
# fraction of the data to read
len_ql = int(rate * len_all)
@ -133,4 +139,229 @@ class PreprocessGenerator:
class PreprocessSimGenerator:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self, path_model_dir):
self.l2i_i2l = None
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
pred_i2l[i2l[str(i)]] = pred[i]
pred_i2l_rank = [sorted(pred_i2l.items(), key=lambda k: k[1], reverse=True)]
return pred_i2l_rank
else:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
pred_l2i[pred[i]] = l2i[pred[i]]
pred_l2i_rank = [sorted(pred_l2i.items(), key=lambda k: k[1], reverse=True)]
return pred_l2i_rank
else:
raise RuntimeError("path_fast_text_model_label2index is None")
def preprocess_get_label_set(self, path):
# first collect the label set, i.e. the concrete classes present
label_set = set()
len_all = 0
file_csv = open(path, "r", encoding="utf-8")
for line in file_csv:
len_all += 1
data = json.loads(line)
label_real = data['label']
label_set.add(label_real)
file_csv.close()
return label_set, len_all
def preprocess_label_ques_to_idx_old(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
label_set, len_all = self.preprocess_get_label_set(path)
# build the label<->index dicts; skip if label2index already exists (used when processing the dev set)
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
count = 0
label2index = {}
index2label = {}
for label_one in label_set:
label2index[label_one] = count
index2label[count] = label_one
count = count + 1
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
# fraction of the data to read
len_ql = int(rate * len_all)
if len_ql <= 500: # not applied to the sample data, so there is enough corpus to train on
len_ql = len_all
def process_line(line):
# per-line processing: get the label and the question indices
data = json.loads(line)
label = data['label']
ques_1 = data['sentence1']
ques_2 = data['sentence2']
offset = data['offset']
mention = data["mention"]
offset_i = int(offset)
# if data.get("label_l2i"):
# ques_entity = data.get("label_l2i") + "#" + ques_1[:offset_i] + "#" + mention + "#" + ques_1[offset_i+len(mention):]
# else:
# ques_entity = ques_1[:offset_i] + "#" + mention + "#" + ques_1[offset_i+len(mention):] + "$$" + ques_2
# que_embed = embed.sentence2idx(text=ques_entity)
que_embed = embed.sentence2idx(ques_1, second_text=ques_2)
label_zeros = [0] * len(l2i_i2l['l2i'])
label_zeros[l2i_i2l['l2i'][label]] = 1
return que_embed, label_zeros
for _ in range(epcoh):
while True:
file_csv = open(path, "r", encoding="utf-8")
cout_all_line = 0
cnt = 0
x, y = [], []
# break out of the loop
if len_ql < cout_all_line:
break
for line in file_csv:
cout_all_line += 1
x_line, y_line = process_line(line)
x.append(x_line)
y.append(y_line)
cnt += 1
if cnt == batch_size:
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_all = [x_1, x_2]
elif embedding_type == 'xlnet':
x_, y_ = x, np.array(y)
x_1 = np.array([x[0][0] for x in x_])
x_2 = np.array([x[1][0] for x in x_])
x_3 = np.array([x[2][0] for x in x_])
x_all = [x_1, x_2, x_3]
else:
x_all, y_ = np.array(x), np.array(y)
cnt = 0
yield (x_all, y_)
x, y =[], []
file_csv.close()
print("preprocess_label_ques_to_idx ok")
def preprocess_label_ques_to_idx(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
label_set, len_all = self.preprocess_get_label_set(path)
# build the label<->index dicts; skip if label2index already exists (used when processing the dev set)
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
count = 0
label2index = {}
index2label = {}
for label_one in label_set:
label2index[label_one] = count
index2label[count] = label_one
count = count + 1
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
# fraction of the data to read
len_ql = int(rate * len_all)
if len_ql <= 500: # not applied to the sample data, so there is enough corpus to train on
len_ql = len_all
def process_line(line):
# per-line processing: get the label and the question indices
data = json.loads(line)
label = data['label']
ques_1 = data['sentence1']
ques_2 = data['sentence2']
offset = data['offset']
mention_1 = data["mention"]
offset_i = int(offset)
que_embed_1 = embed.sentence2idx(text=ques_1)
que_embed_2 = embed.sentence2idx(text=ques_2)
"""ques1"""
[input_id_1, input_type_id_1, input_mask_1] = que_embed_1
input_start_mask_1 = [0] * len(input_id_1)
input_start_mask_1[offset_i] = 1
input_end_mask_1 = [0] * len(input_id_1)
input_end_mask_1[offset_i + len(mention_1) - 1] = 1
input_entity_mask_1 = [0] * len(input_id_1)
input_entity_mask_1[offset_i:offset_i + len(mention_1)] = [1] * len(mention_1)
"""ques2"""
[input_id_2, input_type_id_2, input_mask_2] = que_embed_2
kind_2 = [0] * len(input_type_id_2)
kind_21 = [0] * len(input_type_id_2)
que_2_sp = ques_2.split("|")
if len(que_2_sp)>=2:
que_2_sp_sp = que_2_sp[0].split(":")
if len(que_2_sp_sp)==2:
kind_2_start = len(que_2_sp_sp[0]) - 1
kind_2_end = kind_2_start + len(que_2_sp_sp[1]) - 1
kind_2[kind_2_start:kind_2_end] = [1] * (kind_2_end - kind_2_start)
if "标签:" in que_2_sp[1]:
que_21_sp_sp = que_2_sp[1].split(":")
kind_21_start = len(que_2_sp[0]) + len(que_21_sp_sp[0]) - 1
kind_21_end = len(que_2_sp[0]) + len(que_21_sp_sp[0]) + len(que_21_sp_sp[1]) - 1
kind_21[kind_21_start:kind_21_end] = [1] * (kind_21_end - kind_21_start)
que_embed_x=[input_id_1, input_type_id_1, input_mask_1, input_start_mask_1, input_end_mask_1, input_entity_mask_1,
input_id_2, input_type_id_2, input_mask_2, kind_2, kind_21]
label_zeros = [0] * len(l2i_i2l['l2i'])
label_zeros[l2i_i2l['l2i'][label]] = 1
return que_embed_x, label_zeros
for _ in range(epcoh):
while True:
file_csv = open(path, "r", encoding="utf-8")
cout_all_line = 0
cnt = 0
x, y = [], []
# break out of the loop
if len_ql < cout_all_line:
break
for line in file_csv:
cout_all_line += 1
x_line, y_line = process_line(line)
x.append(x_line)
y.append(y_line)
cnt += 1
if cnt == batch_size:
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(y)
x_all = []
for i in range(len(x_[0])):
x_1 = np.array([x[i] for x in x_])
x_all.append(x_1)
elif embedding_type == 'xlnet':
x_, y_ = x, np.array(y)
x_1 = np.array([x[0][0] for x in x_])
x_2 = np.array([x[1][0] for x in x_])
x_3 = np.array([x[2][0] for x in x_])
x_all = [x_1, x_2, x_3]
else:
x_all, y_ = np.array(x), np.array(y)
cnt = 0
yield (x_all, y_)
x, y =[], []
file_csv.close()
print("preprocess_label_ques_to_idx ok")

View File

@ -5,16 +5,15 @@
# @function :data utils of text classification
from keras_textclassification.conf.path_config import path_model_dir
path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
# from keras_textclassification.conf.path_config import path_model_dir
# path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
# path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import jieba
# import jieba
import json
import re
import os
@ -199,13 +198,15 @@ class PreprocessText:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self):
def __init__(self, path_model_dir):
self.l2i_i2l = None
if os.path.exists(path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
@ -216,7 +217,7 @@ class PreprocessText:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
@ -239,7 +240,7 @@ class PreprocessText:
random.shuffle(indexs)
ques, label = ques[indexs].tolist(), label[indexs].tolist()
# skip the conversion if label2index already exists
if not os.path.exists(path_fast_text_model_l2i_i2l):
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
label_set = set(label)
count = 0
label2index = {}
@ -252,9 +253,9 @@ class PreprocessText:
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
len_ql = int(rate * len(ques))
if len_ql <= 500: # not applied to the sample data, so there is enough corpus to train on
@ -307,13 +308,15 @@ class PreprocessTextMulti:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self):
def __init__(self, path_model_dir):
self.l2i_i2l = None
if os.path.exists(path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
@ -324,7 +327,7 @@ class PreprocessTextMulti:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
@ -365,7 +368,7 @@ class PreprocessTextMulti:
random.shuffle(indexs)
ques, label = ques[indexs].tolist(), label[indexs].tolist()
if not os.path.exists(path_fast_text_model_l2i_i2l):
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
from keras_textclassification.conf.path_config import path_byte_multi_news_label
byte_multi_news_label = txt_read(path_byte_multi_news_label)
byte_multi_news_label = [i.strip().upper() for i in byte_multi_news_label]
@ -383,9 +386,9 @@ class PreprocessTextMulti:
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
len_label_set = len(l2i_i2l['l2i'])
@ -438,13 +441,15 @@ class PreprocessSim:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self):
def __init__(self, path_model_dir):
self.l2i_i2l = None
if os.path.exists(path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
@ -455,7 +460,7 @@ class PreprocessSim:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
@ -467,23 +472,48 @@ class PreprocessSim:
def preprocess_label_ques_to_idx(self, embedding_type, path, embed,
rate=1, shuffle=True, graph=None):
data = pd.read_csv(path)
ques_1 = data['sentence1'].tolist()
ques_2 = data['sentence2'].tolist()
label = data['label'].tolist()
if "json" in path:
datas = txt_read(path)
ques_1 = []
ques_2 = []
label = []
offset = []
mention = []
for data_str in datas:
data = json.loads(data_str)
ques_1 += [data['sentence1']]
ques_2 += [data['sentence2']]
mention += [data['mention']]
label += [data['label']]
offset += [data['offset']]
elif "csv" in path:
data = pd.read_csv(path)
ques_1 = data['sentence1'].tolist()
ques_2 = data['sentence2'].tolist()
label = data['label'].tolist()
offset = data['offset'].tolist()
ques_1 = [str(q1).upper() for q1 in ques_1]
ques_2 = [str(q2).upper() for q2 in ques_2]
label = [str(l).upper() for l in label]
# label = [str(l).upper() for l in label]
label = [str(l) for l in label]
if shuffle:
ques_1 = np.array(ques_1)
ques_2 = np.array(ques_2)
label = np.array(label)
mention = np.array(mention)
offset = np.array(offset)
indexs = [ids for ids in range(len(label))]
random.shuffle(indexs)
ques_1, ques_2, label = ques_1[indexs].tolist(), ques_2[indexs].tolist(), label[indexs].tolist()
ques_1 = ques_1[indexs].tolist()
ques_2 = ques_2[indexs].tolist()
label = label[indexs].tolist()
mention = mention[indexs].tolist()
offset = offset[indexs].tolist()
# skip the conversion if label2index already exists
if not os.path.exists(path_fast_text_model_l2i_i2l):
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
label_set = set(label)
count = 0
label2index = {}
@ -496,12 +526,12 @@ class PreprocessSim:
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
len_ql = int(rate * len(label))
if len_ql <= 500: # not applied to the sample data, so there is enough corpus to train on
if len_ql <= 1: # not applied to the sample data, so there is enough corpus to train on
len_ql = len(label)
x = []
@ -509,8 +539,78 @@ class PreprocessSim:
for i in tqdm(range(len_ql)):
que_1 = ques_1[i]
que_2 = ques_2[i]
que_embed = embed.sentence2idx(text=que_1, second_text=que_2)
x.append(que_embed) # [[], ]
mention_1 = mention[i]
# que_embed = embed.sentence2idx(text=que_1, second_text=que_2)
# x.append(que_embed) # [[], ]
offset_i = int(offset[i])
# ques_entity = que_1 + "##" + que_1[offset_i+len(que_2):]
# ques_entity = que_1
# que_embed1 = embed.sentence2idx(text=que_1, second_text=que_2)
if embedding_type in ['bert', 'albert']:
########################################1111111##############
# [input_id, input_type_id] = que_embed
# input_entity_mask = [0] * len(input_id)
# input_entity_mask[offset_i:offset_i+len(que_2)] = [1] * len(que_2)
# # x.append(que_embed) # [[], ]
# x.append([input_id, input_type_id, input_entity_mask])
# # x.append([input_id, input_type_id, input_entity_mask, offset_i])
######################################## 2222222: pointer network ######################################
# [input_id, input_type_id] = que_embed
# input_start_mask = [0] * len(input_id)
# input_start_mask[offset_i] = 1
# input_end_mask = [0] * len(input_id)
# input_end_mask[offset_i + len(mention_1) - 1] = 1
# x.append([input_id, input_type_id, input_start_mask, input_start_mask])
######################################## encode the two sentences separately ###################################################
que_embed_1 = embed.sentence2idx(text=que_1)
# que_embed_1 = [que[:54] for que in que_embed_1]
que_embed_2 = embed.sentence2idx(text=que_2)
# que_embed_2 = [que[:256-54] for que in que_embed_2]
try:
"""ques1"""
[input_id_1, input_type_id_1, input_mask_1] = que_embed_1
input_start_mask_1 = [0] * len(input_id_1)
input_start_mask_1[offset_i] = 1
input_end_mask_1 = [0] * len(input_id_1)
input_end_mask_1[offset_i+len(mention_1)-1] = 1
input_entity_mask_1 = [0] * len(input_id_1)
input_entity_mask_1[offset_i:offset_i+len(mention_1)] = [1] * len(mention_1)
"""ques2"""
[input_id_2, input_type_id_2, input_mask_2] = que_embed_2
kind_2 = [0] * len(input_type_id_2)
que_2_sp = que_2.split("|")
que_2_sp_sp = que_2_sp[0].split(":")
kind_2_start = len(que_2_sp_sp[0]) - 1
kind_2_end = kind_2_start + len(que_2_sp_sp[1]) - 1
kind_2[kind_2_start:kind_2_end] = [1] * (kind_2_end-kind_2_start)
kind_21 = [0] * len(input_type_id_2)
if "标签" in que_2_sp[1]:
que_21_sp_sp = que_2_sp[1].split(":")
kind_21_start = len(que_2_sp[0]) + len(que_21_sp_sp[0]) - 1
kind_21_end = len(que_2_sp[0]) + len(que_21_sp_sp[0]) + len(que_21_sp_sp[1]) - 1
kind_21[kind_21_start:kind_21_end] = [1] * (kind_21_end - kind_21_start)
except Exception as e:
print(str(e))
gg = 0
x.append([input_id_1, input_type_id_1, input_mask_1, input_start_mask_1, input_end_mask_1, input_entity_mask_1,
input_id_2, input_type_id_2, input_mask_2, kind_2, kind_21])
elif embedding_type == 'xlnet':
if embed.trainable:
[token_input, segment_input, memory_length_input, mask_input] = que_embed
input_entity_mask = [0] * len(token_input)
input_entity_mask[offset_i:offset_i + len(que_2)] = [1] * len(que_2)
# x.append(que_embed) # [[], ]
x.append([token_input, segment_input, memory_length_input, mask_input, input_entity_mask])
else:
[token_input, segment_input, memory_length_input] = que_embed
input_entity_mask = [0] * len(token_input)
input_entity_mask[offset_i:offset_i + len(que_2)] = [1] * len(que_2)
x.append([token_input, segment_input, memory_length_input, input_entity_mask])
label_zo = []
print("label to onehot start!")
label_len_ql = label[0:len_ql]
@ -522,20 +622,26 @@ class PreprocessSim:
if embedding_type in ['bert', 'albert']:
x_, y_ = np.array(x), np.array(label_zo)
x_1 = np.array([x[0] for x in x_])
x_2 = np.array([x[1] for x in x_])
x_all = [x_1, x_2]
# x_1 = np.array([x[0] for x in x_])
# x_2 = np.array([x[1] for x in x_])
# x_3 = np.array([x[2] for x in x_])
# x_4 = np.array([x[3] for x in x_])
# x_all = [x_1, x_2, x_3, x_4]
x_all = []
for i in range(len(x_[0])):
x_all.append(np.array([x[i] for x in x_]))
return x_all, y_
elif embedding_type == 'xlnet':
x_, y_ = x, np.array(label_zo)
x_1 = np.array([x[0][0] for x in x_])
x_2 = np.array([x[1][0] for x in x_])
x_3 = np.array([x[2][0] for x in x_])
x_4 = np.array([x[3][0] for x in x_])
if embed.trainable:
x_4 = np.array([x[3][0] for x in x_])
x_all = [x_1, x_2, x_3, x_4]
x_5 = np.array([x[4][0] for x in x_])
x_all = [x_1, x_2, x_3, x_4, x_5]
else:
x_all = [x_1, x_2, x_3]
x_all = [x_1, x_2, x_3, x_4]
return x_all, y_
else:
x_, y_ = np.array(x), np.array(label_zo)
@ -546,13 +652,15 @@ class PreprocessSimConv2019:
"""
Data preprocessing; input is in csv format, [label, ques]
"""
def __init__(self):
def __init__(self, path_model_dir):
self.l2i_i2l = None
if os.path.exists(path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
if os.path.exists(self.path_fast_text_model_l2i_i2l):
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
def prereocess_idx(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_i2l = {}
i2l = self.l2i_i2l['i2l']
for i in range(len(pred)):
@ -563,7 +671,7 @@ class PreprocessSimConv2019:
raise RuntimeError("path_fast_text_model_label2index is None")
def prereocess_pred_xid(self, pred):
if os.path.exists(path_fast_text_model_l2i_i2l):
if os.path.exists(self.path_fast_text_model_l2i_i2l):
pred_l2i = {}
l2i = self.l2i_i2l['l2i']
for i in range(len(pred)):
@ -593,7 +701,7 @@ class PreprocessSimConv2019:
random.shuffle(indexs)
ques_1, ques_2, label, category = ques_1[indexs].tolist(), ques_2[indexs].tolist(), label[indexs].tolist(), category[indexs].tolist()
# skip the conversion if label2index already exists
if not os.path.exists(path_fast_text_model_l2i_i2l):
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
label_set = set(label)
count = 0
label2index = {}
@ -606,9 +714,9 @@ class PreprocessSimConv2019:
l2i_i2l = {}
l2i_i2l['l2i'] = label2index
l2i_i2l['i2l'] = index2label
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
else:
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
len_ql = int(rate * len(label))
if len_ql <= 500: # not applied to the sample data, so there is enough corpus to train on
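To make the `ques1` mask construction in `PreprocessSim.preprocess_label_ques_to_idx` above concrete, a toy illustration (offsets are treated as character positions here; any shift introduced by BERT tokenization, e.g. the leading [CLS], is not handled in this sketch):
```
mention_1 = "道士"
offset_i = 1
input_id_1 = [0] * 10                                 # stands in for the padded token ids

input_start_mask_1 = [0] * len(input_id_1)
input_start_mask_1[offset_i] = 1                      # marks the first character of the mention
input_end_mask_1 = [0] * len(input_id_1)
input_end_mask_1[offset_i + len(mention_1) - 1] = 1   # marks the last character of the mention
input_entity_mask_1 = [0] * len(input_id_1)
input_entity_mask_1[offset_i:offset_i + len(mention_1)] = [1] * len(mention_1)

print(input_start_mask_1)   # [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
print(input_end_mask_1)     # [0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
print(input_entity_mask_1)  # [0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
```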

View File

@ -2,10 +2,10 @@
# -*- coding: utf-8 -*-
# @time : 2020/3/31 19:10
# @author : Mo
# @function:
# @function: Attention of dot
from keras.regularizers import L1L2, Regularizer
from keras.regularizers import L1L2
# from keras.engine.topology import Layer
from keras.layers import Layer
from keras import backend as K

View File

@ -1,12 +1,13 @@
# -*- coding: UTF-8 -*-
# !/usr/bin/python
# @time :2019/6/22 7:35
# @time :2019/6/22 19:35
# @author :Mo
# @function :self Attention()
# @function :Attention of itself
from keras.regularizers import L1L2, Regularizer
from keras.engine.topology import Layer
# from keras.engine.topology import Layer
from keras.layers import Layer
from keras import backend as K
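The import change above (keras.engine.topology to keras.layers) only moves where the `Layer` base class comes from; a minimal custom-layer sketch against the new import (a hypothetical toy layer, not part of this library):
```
from keras.layers import Layer

class ScaleLayer(Layer):
    # toy layer: learns one scalar and multiplies the input by it
    def build(self, input_shape):
        self.scale = self.add_weight(name="scale", shape=(1,),
                                     initializer="ones", trainable=True)
        super(ScaleLayer, self).build(input_shape)

    def call(self, inputs, **kwargs):
        return inputs * self.scale

    def compute_output_shape(self, input_shape):
        return input_shape
```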

View File

@ -4,6 +4,7 @@
# @author :Mo
# @function :
from keras.layers import Layer
import tensorflow as tf

View File

@ -15,7 +15,6 @@ from keras import backend as K
from keras import regularizers
from keras_textclassification.base.graph import graph
import numpy as np
@ -52,6 +51,9 @@ class AlbertGraph(graph):
# x = Concatenate(axis=1)(concat_out)
# x = Dropout(self.dropout)(x)
x = Flatten()(x)
x = Dropout(self.dropout)(x)
x = Dense(128, activation="tanh")(x)
x = Dropout(self.dropout)(x)
# finally the softmax classifier
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
output_layers = [dense_layer]
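The new classification head in AlbertGraph above (Flatten, Dropout, Dense(128, tanh), Dropout, softmax) can be sketched on its own to show the shapes involved; the sizes below (len_max=50, embed_size=768, label=17) are the ones from the ALBERT training script later in this commit:
```
from keras.layers import Dense, Dropout, Flatten, Input
from keras.models import Model

inp = Input(shape=(50, 768))              # (len_max, embed_size) from the albert embedding
x = Flatten()(inp)                        # -> (batch, 50 * 768)
x = Dropout(0.5)(x)
x = Dense(128, activation="tanh")(x)
x = Dropout(0.5)(x)
out = Dense(17, activation="softmax")(x)  # label = 17 classes
Model(inputs=inp, outputs=out).summary()
```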

View File

@ -12,9 +12,9 @@ import os
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(project_path)
# paths
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
from keras_textclassification.conf.path_config import path_model, path_fineture, path_hyper_parameters # , path_model_dir
# train/validation data paths
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, path_root
# data preprocessing; delete files under the model directory
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, read_and_process, load_json
# model graph
@ -26,6 +26,8 @@ import time
import numpy as np
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert/"
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
"""
@ -46,7 +48,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# take a fraction of this dataset for testing
len_rate = int(len(y) * rate)
@ -57,7 +59,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data handling, tokens
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data handling, tokens
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +93,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +101,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -127,12 +129,13 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
if __name__=="__main__":
path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert/hyper_parameters.json"
# prediction on the test set
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # set to 1 for the sample data, otherwise the training corpus may be very small
pred_tet(path_hyper_parameter=path_fineture, path_test=path_baidu_qa_2019_valid, rate=1) # set to 1 for the sample data, otherwise the training corpus may be very small
# interactive prediction from input()
pred_input()
pred_input(path_hyper_parameter=path_fineture)
# pred
# precision recall f1-score support

View File

@ -19,15 +19,31 @@ sys.path.append(project_path)
# paths
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
# train/validation data paths
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid,\
path_ccks_2020_nil_train, path_ccks_2020_nil_dev, path_root
# data preprocessing; delete files under the model directory
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
# model graph
from keras_textclassification.m00_Albert.graph import AlbertGraph as Graph
# timing
import time
# fast_text config
# model directory
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert/"
# model path
path_model = path_root + '/data/model/ccks_2020_el_cls_albert/model_fast_text.h5'
# hyper-parameter save path
path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_albert/hyper_parameters.json'
# path for saving the trainable (fine-tuned) embedding
path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert/embedding_trainable.h5"
if not os.path.exists(path_model_dir):
os.mkdir(path_model_dir)
def train(hyper_parameters=None, rate=1.0):
"""
Training function
@ -37,24 +53,24 @@ def train(hyper_parameters=None, rate=1.0):
"""
if not hyper_parameters:
hyper_parameters = {
'len_max': 20, # maximum sentence length, fixed; 20-50 recommended
'len_max': 50, # maximum sentence length, fixed; 20-50 recommended
'embed_size': 768, # char/word vector dimension
'vocab_size': 20000, # arbitrary here, adjusted in the code
'trainable': True, # whether the embedding is static or dynamic, i.e. whether it can be fine-tuned
'level_type': 'char', # granularity, smallest unit, char/word; fill in 'char' or 'word'
'embedding_type': 'albert', # embedding type; can also be 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
'gpu_memory_fraction': 0.76, # gpu usage fraction
'gpu_memory_fraction': 0.78, # gpu usage fraction
'model': {'label': 17, # number of classes
'batch_size': 32, # batch size; in principle the larger the better, especially with class imbalance; this setting matters a lot
'filters': [2, 3, 4, 5], # convolution kernel sizes
'filters_num': 300, # number of convolutions; text-cnn: 300-600
'channel_size': 1, # number of CNN channels
'dropout': 0.5, # dropout probability
'decay_step': 100, # learning-rate decay step, decay every N steps
'decay_rate': 0.9, # learning-rate decay factor, multiplicative
'epochs': 20, # maximum training epochs
'decay_step': 1000, # learning-rate decay step, decay every N steps
'decay_rate': 0.999, # learning-rate decay factor, multiplicative
'epochs': 1, # maximum training epochs
'patience': 3, # early stopping, 2-3 is fine
'lr': 5e-5, # learning rate; has a large effect on training; tune it if accuracy does not improve
'lr': 5e-3, # learning rate; has a large effect on training; tune it if accuracy does not improve
'l2': 1e-9, # l2 regularization
'activate_classify': 'softmax', # final layer, i.e. the classification activation
'loss': 'categorical_crossentropy', # loss function
@ -66,11 +82,14 @@ def train(hyper_parameters=None, rate=1.0):
'path_hyper_parameters': path_hyper_parameters, # path of the hyper-parameters of the model (including embedding),
'path_fineture': path_fineture, # path for saving the trainable embedding, e.g. char/word/bert vectors
},
'embedding': {'layer_indexes': [i for i in range(25)] + [-i for i in range(25)], # bert layers to take, including the embedding layer
'embedding': {'layer_indexes': [11] # [i for i in range(25)] + [-i for i in range(25)], # bert layers to take, including the embedding layer
# 'corpus_path': 'D:/soft_install/dataset/bert-model/albert_tiny_489k', # path of the pre-trained embedding; if unset, the default from conf is used; keras-bert can load Google bert, Baidu ernie (needs conversion, https://github.com/ArthurRizar/tensorflow_ernie) and HIT bert-wwm (tf framework, https://github.com/ymcui/Chinese-BERT-wwm)
},
'data':{'train_data': path_baidu_qa_2019_train, # training data
'data':{
'train_data': path_baidu_qa_2019_train, # training data
'val_data': path_baidu_qa_2019_valid # validation data
# 'train_data': path_ccks_2020_nil_train, # training data
# 'val_data': path_ccks_2020_nil_dev # validation data
},
}
@ -82,7 +101,8 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# pt = PreprocessSim(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
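One detail of the new per-model directories above: `PreprocessText` and the generators join `path_model_dir` with file names by plain string concatenation, so the trailing slash matters, and `os.mkdir` only works when the parent directories already exist. A small sketch of the safer equivalents (paths illustrative):
```
import os

path_model_dir = "/path/to/Keras-TextClassification/data/model/ccks_2020_el_cls_albert/"
path_l2i_i2l = path_model_dir + "l2i_i2l.json"                    # concatenation as used in the preprocessors
path_l2i_i2l_safe = os.path.join(path_model_dir, "l2i_i2l.json")  # slash-agnostic alternative
os.makedirs(path_model_dir, exist_ok=True)                        # also creates missing parent directories
```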

View File

@ -51,6 +51,7 @@ class BertGraph(graph):
# concat_out.append(x)
# x = Concatenate(axis=1)(concat_out)
# x = Dropout(self.dropout)(x)
x = Flatten()(x)
# finally the softmax classifier
dense_layer = Dense(self.label, activation=self.activate_classify)(x)

View File

@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# take a fraction of this dataset for testing
len_rate = int(len(y) * rate)
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data handling, tokens
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data handling, tokens
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# take a fraction of this dataset for testing
len_rate = int(len(y) * rate)
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data handling, tokens
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data handling, tokens
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -85,7 +85,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -7,9 +7,10 @@
from keras_textclassification.base.graph import graph
from keras.layers import GlobalMaxPooling1D
from keras.layers import Dense
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
from keras.layers import Dense, Dropout
from keras.models import Model
import keras.backend as K
class FastTextGraph(graph):
@ -29,9 +30,129 @@ class FastTextGraph(graph):
"""
super().create_model(hyper_parameters)
embedding = self.word_embedding.output
x = GlobalMaxPooling1D()(embedding)
x_m = GlobalMaxPooling1D()(embedding)
x_g = GlobalAveragePooling1D()(embedding)
x = Concatenate()([x_g, x_m])
x = Dense(128, activation="tanh")(x)
x = Dropout(self.dropout)(x)
output = Dense(self.label, activation=self.activate_classify)(x)
self.model = Model(inputs=self.word_embedding.input, outputs=output)
self.model.summary(132)
# def focal_loss(self, gamma=2, alpha=0.75): # 0.25, 0.5
def focal_loss(self, gamma=2, alpha=0.75, batch_size=None, label_num=None, epsilon=1.e-7, multi_dim=False, use_softmax=True):
from tensorflow.python.ops import array_ops
import tensorflow as tf
def focal_loss_fixed(y_true, y_pred): # with tensorflow
eps = 1e-12
y_pred = K.clip(y_pred, eps, 1. - eps) # improve the stability of the focal loss and see issues 1 for more information
pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
loss = -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
return loss
def focal_loss_all(prediction_tensor, target_tensor):
r"""Compute focal loss for predictions.
Multi-labels Focal loss formula:
FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing the predicted logits for each class
target_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing one-hot encoded classification targets
weights: A float tensor of shape [batch_size, num_anchors]
alpha: A scalar tensor for focal loss alpha hyper-parameter
gamma: A scalar tensor for focal loss gamma hyper-parameter
Returns:
loss: A (scalar) tensor representing the value of the loss function
"""
sigmoid_p = tf.nn.sigmoid(prediction_tensor)
zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
# For positive prediction, only need consider front part loss, back part is 0;
# target_tensor > zeros <=> z=1, so positive coefficient = z - p.
pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)
# For negative prediction, only need consider back part loss, front part is 0;
# target_tensor > zeros <=> z=1, so negative coefficient = 0.
neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
- (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
return tf.reduce_sum(per_entry_cross_ent)
def focal_loss_category(logits, labels):
'''
:param logits: [batch_size, n_class]
:param labels: [batch_size] not one-hot !!!
:return: -alpha*(1-y)^r * log(y)
Where is 1-y implemented? The value picked via gather is already 1-p; it is not computed explicitly.
After softmax the logits become per-class probabilities, i.e. 1-P and P in the binary case; in the multi-class case they are no longer 1-p.
How is the alpha weight applied?
gather turns alpha into a batch-length vector, which handles the selection and the reshaping at the same time.
Do the probabilities converted from logits need to be clipped?
Yes, to avoid the influence of extreme values.
How to handle inputs of shape (N,P,C) and (N,P)?
First reshape them to the usual shapes N*P,C and N*P.
bug:
ValueError: Cannot convert an unknown Dimension to a Tensor: ?
Caused by the input size sometimes being unknown; if the batch size is fixed, it can be used directly instead.
'''
if multi_dim:
logits = tf.reshape(logits, [-1, logits.shape[2]])
labels = tf.reshape(labels, [-1])
# (Class ,1)
alpha = tf.constant([0.5]*batch_size, dtype=tf.float32)
labels = tf.argmax(labels) #
labels = tf.cast(labels, dtype=tf.int32)
logits = tf.cast(logits, tf.float32)
if use_softmax:
# (N,Class) > N*Class
softmax = tf.reshape(tf.nn.softmax(logits), [-1]) # [batch_size * n_class]
else:
softmax = tf.reshape(tf.nn.sigmoid(logits), [-1]) # [batch_size * n_class]
# (N,) > (N,), but the values change to each label's position within N*Class
# labels_shift = tf.range(0, logits.shape[0]) * logits.shape[1] + labels
labels_shift = tf.range(0, label_num) * batch_size + labels
# (N*Class,) > (N,)
prob = tf.gather(softmax, labels_shift)
# guard against a predicted probability of 0 ; (N,)
prob = tf.clip_by_value(prob, epsilon, 1. - epsilon)
# (Class ,1) > (N,)
alpha_choice = tf.gather(alpha, labels)
# (N,) > (N,)
weight = tf.pow(tf.subtract(1., prob), gamma)
weight = tf.multiply(alpha_choice, weight)
# (N,) > 1
loss = -tf.reduce_sum(tf.multiply(weight, tf.log(prob)))
return loss
return focal_loss_fixed
def create_compile(self):
"""
构建优化器损失函数和评价函数
:return:
"""
from keras_textclassification.keras_layers.keras_radam import RAdam
from keras.optimizers import Adam
# self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
# loss=[self.focal_loss(alpha=.25, gamma=2)],
# metrics=['accuracy'])
self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
loss=[self.focal_loss(alpha=.25, gamma=2)], # self.loss, #
# loss_weights=[0.6, 0.5],
# loss=[self.focal_loss(gamma=2, alpha=0.25, batch_size=self.batch_size, label_num=self.label, epsilon=1.e-7, multi_dim=False, use_softmax=False)],
# loss=[self.focal_loss(gamma=2, alpha=0.75)],
metrics=['accuracy']) # Any optimize
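To make the `focal_loss_fixed` weighting above concrete, a quick numeric illustration of how `gamma` down-weights well-classified examples relative to plain cross-entropy (pure Python, values illustrative):
```
import math

alpha, gamma = 0.75, 2
for p in (0.9, 0.6, 0.1):                         # predicted probability of the true class
    ce = -math.log(p)                             # cross-entropy term
    fl = -alpha * (1 - p) ** gamma * math.log(p)  # focal term for a positive example
    print("p=%.1f  CE=%.3f  FL=%.3f" % (p, ce, fl))
# p=0.9 -> FL ~ 0.001 vs CE ~ 0.105: easy examples contribute almost nothing to the loss
```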

View File

@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# take a fraction of this dataset for testing
len_rate = int(len(y) * rate)
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data handling, tokens
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data handling, tokens
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -9,35 +9,73 @@
import pathlib
import sys
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(project_path)
# paths
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
# train/validation data paths
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
# train/validation data paths
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
path_root
# data preprocessing; delete files under the model directory
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
# model graph
from keras_textclassification.m01_FastText.graph import FastTextGraph as Graph
# from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph
# timing
import time
# # fast_text config
# # 模型目录
# path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert_fasttext/"
# # 语料地址
# path_model = path_root + '/data/model/ccks_2020_el_cls_albert_fasttext/model_fast_text.h5'
# # 超参数保存地址
# path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_albert_fasttext/hyper_parameters.json'
# # embedding微调保存地址
# path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert_fasttext/embedding_trainable.h5"
# fast_text config
# # 模型目录
# path_model_dir = path_root + "/data/model/ccks_2020_el_cls_random_fasttext/"
# # 语料地址
# path_model = path_root + '/data/model/ccks_2020_el_cls_random_fasttext/model_fast_text.h5'
# # 超参数保存地址
# path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_random_fasttext/hyper_parameters.json'
# # embedding微调保存地址
# path_fineture = path_root + "/data/model/ccks_2020_el_cls_random_fasttext/embedding_trainable.h5"
# if not os.path.exists(path_model_dir):
# os.mkdir(path_model_dir)
def train(hyper_parameters=None, rate=1.0):
if not hyper_parameters:
hyper_parameters = {
'len_max': 50, # maximum sentence length, fixed; 20-50 recommended; longer is slower for bert and uses more memory, beware of OOM
'embed_size': 150, # char/word vector dimension; 768 for bert, 300 for word, char can be smaller
'len_max': 56, # maximum sentence length, fixed; 20-50 recommended; longer is slower for bert and uses more memory, beware of OOM
'embed_size': 300, # char/word vector dimension; 768 for bert, 300 for word, char can be smaller
'vocab_size': 20000, # arbitrary here, adjusted in the code
'trainable': True, # whether the embedding is static or dynamic, i.e. whether it can be fine-tuned
'level_type': 'ngram', # granularity, smallest unit, char/word; fill in 'char' or 'word'; note: in word2vec mode the training corpus must be pre-segmented
'level_type': 'char', # granularity, smallest unit, char/word; fill in 'char' or 'word'; note: in word2vec mode the training corpus must be pre-segmented
'embedding_type': 'random', # embedding type; can also be 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
# 'gpu_memory_fraction': 0.76, # gpu usage fraction
'model': {'label': 17, # number of classes
'batch_size': 64, # batch size; in principle the larger the better, especially with class imbalance; this setting matters a lot
'batch_size': 256, # batch size; in principle the larger the better, especially with class imbalance; this setting matters a lot
'dropout': 0.5, # dropout probability
'decay_step': 1000, # learning-rate decay step, decay every N steps
'decay_rate': 0.9, # learning-rate decay factor, multiplicative
'decay_rate': 0.999, # learning-rate decay factor, multiplicative
'filters': [3, 7, 7],
'filters_num': 300, # number of convolutions; filters_num=150,300 in the paper
'epochs': 20, # maximum training epochs
'patience': 3, # early stopping, 2-3 is fine
'lr': 1e-3, # learning rate; 5e-5 for bert, 1e-3 otherwise; has a large effect on training; tune it if accuracy does not improve
@ -54,11 +92,11 @@ def train(hyper_parameters=None, rate=1.0):
'path_fineture': path_fineture, # path for saving the trainable embedding, e.g. char/word/bert vectors
},
'embedding': {'layer_indexes': [24], # bert layers to take
'ngram_ns': [3],
'corpus_path': path_baidu_qa_2019_train,
# 'ngram_ns': [3],
# 'corpus_path': path_baidu_qa_2019_train,
},
'data':{'train_data': path_baidu_qa_2019_train, # training data
'val_data': path_baidu_qa_2019_valid # validation data
'data':{'train_data': path_ccks_2020_el_cls_train, # training data
'val_data': path_ccks_2020_el_cls_dev # validation data
},
}
@ -70,7 +108,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessSim(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -36,7 +36,7 @@ class TextCNNGraph(graph):
kernel_size = (filter, self.embed_size),
padding = 'valid',
kernel_initializer = 'normal',
activation = 'relu',
activation = 'tanh',
)(embedding_reshape)
pooled = MaxPool2D(pool_size = (self.len_max - filter + 1, 1),
strides = (1, 1),
@ -45,8 +45,132 @@ class TextCNNGraph(graph):
conv_pools.append(pooled)
# concatenate the pooled features
x = Concatenate(axis=-1)(conv_pools)
x = Dropout(self.dropout)(x)
x = Flatten()(x)
x = Dense(units=64, activation='tanh')(x)
x = Dropout(self.dropout)(x)
output = Dense(units=self.label, activation=self.activate_classify)(x)
self.model = Model(inputs=self.word_embedding.input, outputs=output)
self.model.summary(120)
# def focal_loss(self, gamma=2, alpha=0.75): # 0.25, 0.5
def focal_loss(self, gamma=2, alpha=0.75, batch_size=None, label_num=None, epsilon=1.e-7, multi_dim=False, use_softmax=True):
from tensorflow.python.ops import array_ops
import keras.backend as K
import tensorflow as tf
def focal_loss_fixed(y_true, y_pred): # with tensorflow
eps = 1e-12
y_pred = K.clip(y_pred, eps, 1. - eps) # improve the stability of the focal loss and see issues 1 for more information
pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
loss = -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
return loss
def focal_loss_all(prediction_tensor, target_tensor):
r"""Compute focal loss for predictions.
Multi-labels Focal loss formula:
FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
Args:
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing the predicted logits for each class
target_tensor: A float tensor of shape [batch_size, num_anchors,
num_classes] representing one-hot encoded classification targets
weights: A float tensor of shape [batch_size, num_anchors]
alpha: A scalar tensor for focal loss alpha hyper-parameter
gamma: A scalar tensor for focal loss gamma hyper-parameter
Returns:
loss: A (scalar) tensor representing the value of the loss function
"""
sigmoid_p = tf.nn.sigmoid(prediction_tensor)
zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
        # For positive predictions, only the front part of the loss matters; the back part is 0;
        # target_tensor > zeros <=> z=1, so the positive coefficient = z - p.
pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)
# For negative prediction, only need consider back part loss, front part is 0;
# target_tensor > zeros <=> z=1, so negative coefficient = 0.
neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
- (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
return tf.reduce_sum(per_entry_cross_ent)
def focal_loss_category(logits, labels):
'''
:param logits: [batch_size, n_class]
:param labels: [batch_size] not one-hot !!!
:return: -alpha*(1-y)^r * log(y)
它是在哪实现 1- y 通过gather选择的就是1-p,而不是通过计算实现的
logits soft max之后是多个类别的概率也就是二分类时候的1-P和P多分类的时候不是1-p了
怎么把alpha的权重加上去
通过gather把alpha选择后变成batch长度同时达到了选择和维度变换的目的
是否需要对logits转换后的概率值进行限制
需要的避免极端情况的影响
针对输入是 (NPC ) (NP)怎么处理
先把他转换为和常规的一样形状N*PC N*P,
bug:
ValueError: Cannot convert an unknown Dimension to a Tensor: ?
因为输入的尺寸有时是未知的导致了该bug,如果batchsize是确定的可以直接修改为batchsize
'''
if multi_dim:
logits = tf.reshape(logits, [-1, logits.shape[2]])
labels = tf.reshape(labels, [-1])
# (Class ,1)
        alpha = tf.constant([0.5] * label_num, dtype=tf.float32)  # one alpha weight per class, gathered by label index below
        labels = tf.argmax(labels, axis=-1)  # one-hot labels -> class indices
labels = tf.cast(labels, dtype=tf.int32)
logits = tf.cast(logits, tf.float32)
if use_softmax:
# (N,Class) > N*Class
softmax = tf.reshape(tf.nn.softmax(logits), [-1]) # [batch_size * n_class]
else:
softmax = tf.reshape(tf.nn.sigmoid(logits), [-1]) # [batch_size * n_class]
# (N,) > (N,) ,但是数值变换了变成了每个label在N*Class中的位置
# labels_shift = tf.range(0, logits.shape[0]) * logits.shape[1] + labels
        labels_shift = tf.range(0, batch_size) * label_num + labels  # position of (sample i, label_i) in the flattened (N*Class,) softmax
# (N*Class,) > (N,)
prob = tf.gather(softmax, labels_shift)
# 预防预测概率值为0的情况 ; (N,)
prob = tf.clip_by_value(prob, epsilon, 1. - epsilon)
# (Class ,1) > (N,)
alpha_choice = tf.gather(alpha, labels)
# (N,) > (N,)
weight = tf.pow(tf.subtract(1., prob), gamma)
weight = tf.multiply(alpha_choice, weight)
# (N,) > 1
loss = -tf.reduce_sum(tf.multiply(weight, tf.log(prob)))
return loss
return focal_loss_fixed
def create_compile(self):
"""
构建优化器损失函数和评价函数
:return:
"""
from keras_textclassification.keras_layers.keras_radam import RAdam
from keras.optimizers import Adam
# self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
# loss=[self.focal_loss(alpha=.25, gamma=2)],
# metrics=['accuracy'])
self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
loss=[self.focal_loss(alpha=.25, gamma=2)], # self.loss, #
# loss_weights=[0.6, 0.5],
# loss=[self.focal_loss(gamma=2, alpha=0.25, batch_size=self.batch_size, label_num=self.label, epsilon=1.e-7, multi_dim=False, use_softmax=False)],
# loss=[self.focal_loss(gamma=2, alpha=0.75)],
metrics=['accuracy']) # Any optimize
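
As a quick sanity check of the focal term, the NumPy sketch below (not part of the repository) evaluates -alpha*(1-p)^gamma*log(p) for a well-classified and a badly-classified positive example; with gamma=2 the easy example contributes almost nothing, which is the point of swapping plain cross-entropy for focal loss on an imbalanced label set.

```
import numpy as np

def binary_focal_term(p, gamma=2.0, alpha=0.25, eps=1e-12):
    # Focal-loss contribution of a single positive example predicted with probability p.
    p = np.clip(p, eps, 1.0 - eps)
    return -alpha * (1.0 - p) ** gamma * np.log(p)

easy = binary_focal_term(0.95)   # confident, correct prediction
hard = binary_focal_term(0.10)   # confident, wrong prediction
print(easy, hard, hard / easy)   # the hard example dominates by roughly four orders of magnitude
```
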

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -87,7 +87,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -95,7 +95,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -111,7 +111,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
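
The 'bert'/'albert' branch exists because sentence2idx returns a pair [input_id, input_type_id] for BERT-style embeddings, while the other embedding types return a single index sequence, so the prediction input has to be wrapped differently. A stripped-down sketch of that dispatch (array shapes and the helper name are assumed for illustration):

```
import numpy as np

def to_model_input(ques_embed, embedding_type):
    # BERT/ALBERT tokenisation yields [token_ids, segment_ids]; feed them as two inputs.
    if embedding_type in ['bert', 'albert']:
        return [np.array([ques_embed[0]]), np.array([ques_embed[1]])]
    # random / word2vec embeddings yield a single id sequence; feed it as one batch of size 1.
    return np.array([ques_embed])
```
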

View File

@ -70,7 +70,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -79,8 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
ra_ed, rate=rate, shuffle=True)
print("data propress ok!")
print(len(y_train))
# 训练
graph.fit(x_train, y_train, x_val, y_val)
# 训练 graph.fit(x_train, y_train, x_val, y_val)
print("耗时:" + str(time.time()-time_start))

View File

@ -41,7 +41,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -52,7 +52,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -82,7 +82,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -90,7 +90,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -106,7 +106,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -83,7 +83,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -69,7 +69,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -85,13 +85,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 开始时候一支部队, learning_rate=0.01才好起来
# rate=0.01,random,char下训练结果, random随机embedding对结果影响很大
# 11s 737us/step - loss: 1.3494 - acc: 0.6646 - val_loss: 1.9863 - val_acc: 0.5501
# Epoch 00003: val_loss improved from 2.09217 to 1.98626, saving model to
# Epoch 4/20

View File

@ -80,9 +80,11 @@ class RCNNGraph(graph):
conv_pools.append(pooled)
# 拼接
x = Concatenate()(conv_pools)
x = Dropout(self.dropout)(x)
x = Flatten()(x)
#########################################################################
x = Dense(units=128, activation="tanh")(x)
x = Dropout(self.dropout)(x)
output = Dense(units=self.label, activation=self.activate_classify)(x)
self.model = Model(inputs=self.word_embedding.input, outputs=output)
self.model.summary(120)

View File

@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -79,7 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -9,6 +9,7 @@
import pathlib
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(project_path)
# 地址
@ -34,7 +35,7 @@ def train(hyper_parameters=None, rate=1.0):
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
'gpu_memory_fraction': 0.66, #gpu使用率
'model': {'label': 17, # 类别数
'batch_size': 32, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'batch_size': 16, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'filters': [[10, 7, 5], [6, 4, 3]], # 3层的时候
# 'filters': [[10, 7], [5, 3]], # 2层的时候
# 'filters': [[5, 3], [4, 2]], #2层的时候
@ -71,7 +72,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -87,11 +88,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 14251/14251 [==============================] - 40s 3ms/step - loss: 0.8393 - acc: 0.7466 - val_loss: 1.0829 - val_acc: 0.6637
# Epoch 00003: val_loss improved from 1.12679 to 1.08295, saving model to
# Epoch 4/20

View File

@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -91,10 +91,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__=="__main__":
train(rate=1)
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 14251/14251 [==============================] - 13s 931us/step - loss: 1.0622 - acc: 0.6821 - val_loss: 1.6637 - val_acc: 0.6214
# Epoch 00003: val_loss improved from 1.74499 to 1.66371, saving model
# Epoch 4/20

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -83,7 +83,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -79,7 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -83,6 +83,9 @@ class DeepMojiGraph(graph):
x, weights = x
x = Dropout(self.dropout)(x)
x = Dense(128, activation="tanh")(x)
x = Dropout(self.dropout)(x)
# x = Flatten()(x)
# 最后就是softmax
dense_layer = Dense(self.label, activation=self.activate_classify)(x)

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -13,18 +13,37 @@ import os
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
sys.path.append(project_path)
# 地址
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, \
path_hyper_parameters
from keras_textclassification.conf.path_config import path_model, path_fineture, \
path_model_dir, path_hyper_parameters
# 训练验证数据地址
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
path_root
# 数据预处理, 删除文件目录下文件
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
# 模型图
from keras_textclassification.m10_DeepMoji.graph import DeepMojiGraph as Graph
# 计算时间
import time
# fast_text config
# 模型目录
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_deepmoji/"
# 语料地址
path_model = path_model_dir + 'model_fast_text.h5'
# 超参数保存地址
path_hyper_parameters = path_model_dir + 'hyper_parameters.json'
# embedding微调保存地址
path_fineture = path_model_dir + "embedding_trainable.h5"
if not os.path.exists(path_model_dir):
os.mkdir(path_model_dir)
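
Note that os.mkdir only creates the final directory and raises FileNotFoundError when the parent data/model/ directory does not yet exist; on a fresh checkout os.makedirs is the safer variant. This is a suggestion, not what the commit does:

```
import os

# Creates intermediate directories as needed and does not fail if the path already exists.
os.makedirs(path_model_dir, exist_ok=True)
```
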
def train(hyper_parameters=None, rate=1.0):
"""
训练函数
@ -34,20 +53,20 @@ def train(hyper_parameters=None, rate=1.0):
"""
if not hyper_parameters:
hyper_parameters = {
'len_max': 50, # 句子最大长度, 固定 推荐20-50
'len_max': 1376, # 句子最大长度, 固定 推荐20-50
'embed_size': 300, # 字/词向量维度
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
'gpu_memory_fraction': 0.66, # gpu使用率
'model': {'label': 17, # 类别数
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'gpu_memory_fraction': 0.8, # gpu使用率
'model': {'label': 23, # 类别数
'batch_size': 16, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'dropout': 0.5, # 随机失活, 概率
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
'decay_rate': 0.9, # 学习率衰减系数, 乘法
'decay_rate': 0.999, # 学习率衰减系数, 乘法
'epochs': 20, # 训练最大轮次
'patience': 3, # 早停,2-3就好
'patience': 6, # 早停,2-3就好
'lr': 1e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
'l2': 1e-6, # l2正则化
'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数
@ -58,25 +77,25 @@ def train(hyper_parameters=None, rate=1.0):
'path_hyper_parameters': path_hyper_parameters, # 模型(包括embedding),超参数地址,
'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等
'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'Bidirectional-GRU'
'rnn_units': 256, # RNN隐藏层,
'rnn_units': 150, # RNN隐藏层,
},
'embedding': {'layer_indexes': [12], # bert取的层数,
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架https://github.com/ymcui/Chinese-BERT-wwm)
},
'data': {'train_data': path_baidu_qa_2019_train, # 训练数据
'val_data': path_baidu_qa_2019_valid # 验证数据
'data': {'train_data': path_ccks_2020_el_cls_train, # 训练数据
'val_data': path_ccks_2020_el_cls_dev # 验证数据
},
}
# 删除先前存在的模型\embedding微调模型等
delete_file(path_model_dir)
# delete_file(path_model_dir)
time_start = time.time()
# graph初始化
graph = Graph(hyper_parameters)
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessSim(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -91,12 +110,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 参数较多,不适合用bert,会比较慢和OOM
# 1425/1425 [==============================] - 6s 4ms/step - loss: 1.0751 - acc: 0.6618 - val_loss: 1.6913 - val_acc: 0.5227
# Epoch 00007: val_loss improved from 1.71417 to 1.69127, saving model
# Epoch 8/20
train(rate=1)

View File

@ -8,7 +8,7 @@
from keras import regularizers
from keras.layers import Dense
from keras.layers import Dropout, Flatten
from keras.layers import SpatialDropout1D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers import SpatialDropout1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
from keras.models import Model
from keras_textclassification.keras_layers.attention_self import AttentionSelf
@ -34,9 +34,13 @@ class SelfAttentionGraph(graph):
x = self.word_embedding.output
x = SpatialDropout1D(self.dropout_spatial)(x)
x = AttentionSelf(self.word_embedding.embed_size)(x)
x = GlobalMaxPooling1D()(x)
x_max = GlobalMaxPooling1D()(x)
x_avg = GlobalAveragePooling1D()(x)
x = Concatenate()([x_max, x_avg])
x = Dropout(self.dropout)(x)
# x = Flatten()(x)
x = Dense(72, activation="tanh")(x)
x = Dropout(self.dropout)(x)
# 最后就是softmax
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
output = [dense_layer]
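
Concatenating global max pooling with global average pooling doubles the pooled feature width (two vectors of embed_size each), letting the classifier see both the strongest activation and the overall mean per channel. A minimal standalone Keras sketch of that pooling head, with placeholder dimensions rather than the project's configured values:

```
from keras.layers import Input, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dense
from keras.models import Model

seq_in = Input(shape=(50, 300))            # (len_max, embed_size), placeholder sizes
x_max = GlobalMaxPooling1D()(seq_in)       # (batch, 300)
x_avg = GlobalAveragePooling1D()(seq_in)   # (batch, 300)
x = Concatenate()([x_max, x_avg])          # (batch, 600)
out = Dense(23, activation='softmax')(x)   # label count is just an example
model = Model(inputs=seq_in, outputs=out)
```
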

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -13,9 +13,14 @@ sys.path.append(project_path)
# 地址
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
# 训练验证数据地址
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
path_root
# 数据预处理, 删除文件目录下文件
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
# 模型图
from keras_textclassification.m11_SelfAttention.graph import SelfAttentionGraph as Graph
# 计算时间
@ -31,18 +36,18 @@ def train(hyper_parameters=None, rate=1.0):
"""
if not hyper_parameters:
hyper_parameters = {
'len_max': 50, # 句子最大长度, 固定 推荐20-50
'len_max': 1376, # 句子最大长度, 固定 推荐20-50
'embed_size': 300, # 字/词向量维度
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
'gpu_memory_fraction': 0.66, # gpu使用率
'model': {'label': 17, # 类别数
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'gpu_memory_fraction': 0.76, # gpu使用率
'model': {'label': 23, # 类别数
'batch_size': 8, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
'dropout': 0.5, # 随机失活, 概率
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
'decay_rate': 0.9, # 学习率衰减系数, 乘法
'decay_step': 1000, # 学习率衰减step, 每N个step衰减一次
'decay_rate': 0.999, # 学习率衰减系数, 乘法
'epochs': 20, # 训练最大轮次
'patience': 3, # 早停,2-3就好
'lr': 1e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
@ -58,8 +63,8 @@ def train(hyper_parameters=None, rate=1.0):
'embedding': {'layer_indexes': [12], # bert取的层数,
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架https://github.com/ymcui/Chinese-BERT-wwm)
},
'data': {'train_data': path_baidu_qa_2019_train, # 训练数据
'val_data': path_baidu_qa_2019_valid # 验证数据
'data': {'train_data': path_ccks_2020_el_cls_train, # 训练数据
'val_data': path_ccks_2020_el_cls_dev # 验证数据
},
}
@ -71,7 +76,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessSim(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -86,13 +91,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 参数较多,不适合用bert,会比较慢和OOM
#
# 1425/1425 [==============================] - 0s 283us/step - loss: 1.0207 - acc: 0.7396 - val_loss: 1.8706 - val_acc: 0.5000
# Epoch 00012: val_loss improved from 1.89859 to 1.87060, saving model to
# Epoch 13/20
train(rate=1)

View File

@ -5,8 +5,8 @@
# @function :Hierarchical Attention Networks for Document Classification(https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)
from keras.layers import Dense, Dropout, SpatialDropout1D, Flatten, Input
from keras.layers import Bidirectional, LSTM, GRU, TimeDistributed
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Bidirectional, GRU
from keras import regularizers
from keras.models import Model
import keras.backend as K

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -96,7 +96,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -112,7 +112,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -26,6 +26,8 @@ import time
import numpy as np
# path_hyper_parameters=path_model_dir + "hyper_parameters.json"
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
# 测试集的准确率
@ -40,7 +42,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +53,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +83,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +107,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -36,7 +36,7 @@ def train(hyper_parameters=None, rate=1.0):
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
'trainable': True, # embedding是静态的还是动态的
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
'embedding_type': 'word2vec', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
'gpu_memory_fraction': 0.66, # gpu使用率
'model': {'label': 17, # 类别数
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
@ -80,7 +80,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -95,10 +95,4 @@ def train(hyper_parameters=None, rate=1.0):
if __name__ == "__main__":
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-fineture了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:ailed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# 参数较多,不适合用bert,会比较慢和OOM
# 速度很慢呐,字/词维度(embed_size)一设大就OOM了不喜欢用
train(rate=1)

View File

@ -27,6 +27,8 @@ import time
import numpy as np
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
"""
测试集测试与模型评估
@ -46,7 +48,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -57,7 +59,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +93,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +101,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -72,7 +72,7 @@ def train(hyper_parameters=None, rate=1.0):
print('graph init ok!')
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -71,7 +71,7 @@ def train(hyper_parameters=None, rate=1.0):
print('graph init ok!')
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -154,7 +154,7 @@ def train_sim(hyper_parameters=None, rate=1.0):
if __name__=='__main__':
# train(rate=1)
train_sim()
train(rate=1)
# train_sim()

View File

@ -166,3 +166,4 @@ def train(graph='TextCNN', label=17, rate=1.0, hyper_parameters=None, path_train
if __name__ == "__main__":
train(graph='TextCNN', label=17, rate=1, path_train_data=None, path_dev_data=None,hyper_parameters=None)

View File

@ -1,13 +1,12 @@
gensim>=3.7.1
jieba>=0.39
numpy>=1.16.2
pandas>=0.23.4
scikit-learn>=0.19.1
tflearn>=0.3.2
tqdm>=4.31.1
passlib>=1.7.1
gensim==3.7.1
jieba==0.39
numpy==1.16.2
pandas==0.23.4
scikit-learn==0.19.1
tflearn==0.3.2
tqdm==4.31.1
passlib==1.7.1
keras==2.2.4
tensorflow-gpu==1.12.0
keras-bert>=0.80.0
keras-xlnet>=0.16.0
keras-adaptive-softmax>=0.6.0
keras-bert==0.80.0
keras-xlnet==0.16.0
keras-adaptive-softmax==0.6.0

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -86,7 +86,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -94,7 +94,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -110,7 +110,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -39,7 +39,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -61,7 +61,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -43,7 +43,7 @@ def train(hyper_parameters=None, rate=1.0):
'lr': 1e-3, # 学习率, bert取5e-5, 其他取1e-3, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
'l2': 1e-9, # l2正则化
'activate_classify': 'sigmoid', # 'sigmoid', # 最后一个layer, 即分类激活函数
'loss': 'categorical_crossentropy', # 损失函数, 可能有问题, 可以自己定义
'loss': 'binary_crossentropy', # 损失函数, 可能有问题, 可以自己定义
'metrics': 'top_k_categorical_accuracy', # 1070个类, 太多了先用topk, 这里数据k设置为最大:33
# 'metrics': 'categorical_accuracy', # 保存更好模型的评价标准
'is_training': True, # 训练后者是测试模型
@ -68,7 +68,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessTextMulti()
pt = PreprocessTextMulti(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
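
Switching the loss to binary_crossentropy matches the sigmoid output layer: each label unit is treated as an independent yes/no decision, which is what multi-label classification needs, whereas categorical_crossentropy assumes exactly one true class per sample. A hedged compile sketch of that pairing (the model and feature size are placeholders; only the 1070-label count is taken from the comment above):

```
from keras.layers import Input, Dense
from keras.models import Model

inp = Input(shape=(128,))                      # placeholder feature size
out = Dense(1070, activation='sigmoid')(inp)   # one independent unit per label
model = Model(inputs=inp, outputs=out)
model.compile(optimizer='adam',
              loss='binary_crossentropy',      # pairs with sigmoid for multi-label targets
              metrics=['top_k_categorical_accuracy'])
```
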

View File

@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# 取该数据集的百分之几的语料测试
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# 输入预测
# 加载超参数
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# 模式初始化和加载
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]

View File

@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)

View File

@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# 数据预处理
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)