commit d1cb728d23 (parent e5458ec86f)

fix text_preprocess and albert

README.md
@@ -141,5 +141,16 @@ train(graph='TextCNN', # required, algorithm name; options "ALBERT","BERT","XLNET","FASTT
                      hyper_parameters=None) # optional, hyperparameters in JSON format; the default embedding is 'char', 'random'
```
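The hunk above only shows the first and last keywords of the documented `train(...)` call; the remaining arguments are elided. A minimal usage sketch under that limitation, exercising just the two keywords that are shown:

```
# usage sketch only: the other train() arguments are elided in the hunk above,
# so everything else is left at the library defaults here
from keras_textclassification.text_classification_api import train

train(graph='TextCNN',        # algorithm name, e.g. "ALBERT", "BERT", "XLNET", "FASTTEXT", "TextCNN"
      hyper_parameters=None)  # None -> default hyperparameters ('char' level, 'random' embedding)
```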

# Reference

For citing this work, you can refer to the present GitHub project. For example, with BibTeX:

```
@misc{Keras-TextClassification,
    howpublished = {\url{https://github.com/yongzhuo/Keras-TextClassification}},
    title = {Keras-TextClassification},
    author = {Yongzhuo Mo},
    publisher = {GitHub},
    year = {2019}
}
```

*Hope this helps!
@@ -5,5 +5,10 @@
# @function :


from keras_textclassification.text_classification_api import train

# from keras_textclassification.text_classification_api import train
#
# res = "假道士敷衍超渡,鬼魂一家感觉受到了屈辱,现出真身捉弄他"
# mention = "道士"
# offset = 1
# print(res[1])
# print(res[1+1])
@@ -83,7 +83,9 @@ class BaseEmbedding:
        self.token2idx = {}
        self.idx2token = {}

    def sentence2idx(self, text, second_text=""):
    def sentence2idx(self, text, second_text=None):
        if second_text:
            second_text = "[SEP]" + str(second_text).upper()
        # text = extract_chinese(str(text).upper())
        text = str(text).upper()

@@ -296,14 +298,41 @@ class BertEmbedding(BaseEmbedding):
        self.vocab_size = len(self.token_dict)
        self.tokenizer = keras_bert.Tokenizer(self.token_dict)

    def sentence2idx(self, text, second_text=""):
        # text = extract_chinese(str(text).upper())
    def build_keras4bert(self):
        import bert4keras
        from bert4keras.models import build_transformer_model
        from bert4keras.tokenizers import Tokenizer, load_vocab
        import os
        self.embedding_type = 'bert'
        config_path = os.path.join(self.corpus_path, 'bert_config.json')
        checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        self.model = bert4keras.models.build_transformer_model(config_path=config_path,
                                                                checkpoint_path=checkpoint_path)

        # load and simplify the vocabulary, then build the tokenizer
        self.token_dict, keep_tokens = load_vocab(
            dict_path=dict_path,
            simplified=True,
            startwith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.vocab_size = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

    def sentence2idx(self, text, second_text=None):
        text = extract_chinese(str(text).upper())
        text = str(text).upper()
        input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max)
        # input_mask = [0 if ids == 0 else 1 for ids in input_id]
        # return input_id, input_type_id, input_mask
        return [input_id, input_type_id]

        # input_id, input_type_id = self.tokenizer.encode(first_text=text,
        #                                                 second_text=second_text,
        #                                                 max_length=self.len_max,
        #                                                 first_length=self.len_max)
        #
        # input_mask = [0 if ids == 0 else 1 for ids in input_id]
        # return [input_id, input_type_id, input_mask]
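For reference, the `[input_id, input_type_id]` pair returned by `sentence2idx` above can be reproduced standalone with the keras_bert tokenizer that this module already uses; the vocabulary path and max length below are placeholders, not values from this commit:

```
# rough sketch of the encode step used by sentence2idx(); paths/lengths are placeholders
import codecs
import keras_bert

token_dict = {}
with codecs.open("vocab.txt", "r", "utf-8") as reader:   # placeholder vocab path
    for line in reader:
        token_dict[line.strip()] = len(token_dict)

tokenizer = keras_bert.Tokenizer(token_dict)
input_id, input_type_id = tokenizer.encode(first="我要打王者荣耀", second=None, max_len=32)
print(len(input_id), len(input_type_id))  # both padded/truncated to max_len
```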
class XlnetEmbedding(BaseEmbedding):
    def __init__(self, hyper_parameters):
@@ -408,7 +437,7 @@ class XlnetEmbedding(BaseEmbedding):
        self.embedding_size = self.model.output_shape[-1]
        self.vocab_size = len(self.tokenizer.sp)

    def sentence2idx(self, text, second_text=""):
    def sentence2idx(self, text, second_text=None):
        # text = extract_chinese(str(text).upper())
        text = str(text).upper()
        tokens = self.tokenizer.encode(text)
@@ -6,7 +6,7 @@


from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
from keras_textclassification.data_preprocess.generator_preprocess import PreprocessGenerator
from keras_textclassification.data_preprocess.generator_preprocess import PreprocessGenerator, PreprocessSimGenerator
from keras_textclassification.data_preprocess.text_preprocess import save_json
from keras_textclassification.keras_layers.keras_lookahead import Lookahead
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
@@ -57,8 +57,8 @@ class graph:
        # keras / tensorflow GPU usage control, etc.
        import tensorflow as tf
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
        # config.gpu_options.allow_growth = True
        # config.gpu_options.per_process_gpu_memory_fraction = self.gpu_memory_fraction
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        K.set_session(sess)
        self.create_model(hyper_parameters)
@@ -101,7 +101,7 @@ class graph:
        cb_em = [ TensorBoard(log_dir=os.path.join(self.path_model_dir, "logs"), batch_size=self.batch_size, update_freq='batch'),
                  EarlyStopping(monitor='val_loss', mode='min', min_delta=1e-8, patience=self.patience),
                  ModelCheckpoint(monitor='val_loss', mode='min', filepath=self.model_path, verbose=1,
                                  save_best_only=True, save_weights_only=False),]
                                  save_best_only=True, save_weights_only=True),]
        return cb_em
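The `ModelCheckpoint` change above switches to weights-only checkpoints. A weights-only file can only be restored into an already-built model; a minimal sketch of the matching load step, reusing names from this class (illustrative, not part of the commit):

```
# illustrative only: restoring a weights-only checkpoint written by the callback above
self.create_model(hyper_parameters)        # rebuild the same architecture first
self.model.load_weights(self.model_path)   # then load the checkpointed weights
```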

    def create_compile(self):
@@ -109,9 +109,10 @@ class graph:
        build the optimizer, loss function and evaluation metrics
        :return:
        """

        if self.optimizer_name.upper() == "ADAM":
            self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
                               loss=self.loss,
                               loss= self.loss,
                               metrics=[self.metrics])  # Any optimize
        elif self.optimizer_name.upper() == "RADAM":
            self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
@@ -119,7 +120,7 @@ class graph:
                               metrics=[self.metrics])  # Any optimize
        else:
            self.model.compile(optimizer=RAdam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
                               loss=self.loss,
                               loss= self.loss,
                               metrics=[self.metrics])  # Any optimize
        lookahead = Lookahead(k=5, alpha=0.5)  # Initialize Lookahead
        lookahead.inject(self.model)  # add into model
@@ -139,6 +140,9 @@ class graph:
        self.hyper_parameters['model']['dropout'] = 0.0

        save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)
        # if self.is_training and os.path.exists(self.model_path):
        #     print("load_weights")
        #     self.model.load_weights(self.model_path)
        # train the model
        self.model.fit(x_train, y_train, batch_size=self.batch_size,
                       epochs=self.epochs, validation_data=(x_dev, y_dev),
@@ -164,17 +168,19 @@ class graph:

        save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)

        pg = PreprocessGenerator()
        pg = PreprocessGenerator(self.path_model_dir)
        _, len_train = pg.preprocess_get_label_set(self.hyper_parameters['data']['train_data'])
        data_fit_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
                                                             batch_size=self.batch_size,
                                                             path=self.hyper_parameters['data']['train_data'],
                                                             epcoh=self.epochs,
                                                             embed=embed,
                                                             rate=rate)
        _, len_val = pg.preprocess_get_label_set(self.hyper_parameters['data']['val_data'])
        data_dev_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
                                                             batch_size=self.batch_size,
                                                             path=self.hyper_parameters['data']['val_data'],
                                                             epcoh=self.epochs,
                                                             embed=embed,
                                                             rate=rate)
        steps_per_epoch = len_train // self.batch_size + 1
@@ -190,6 +196,54 @@ class graph:
        if self.trainable:
            self.word_embedding.model.save(self.path_fineture)

    def fit_generator_sim(self, embed, rate=1):
        """

        :param data_fit_generator: yield, training data
        :param data_dev_generator: yield, validation data
        :param steps_per_epoch: int, steps per training epoch
        :param validation_steps: int, steps per validation epoch
        :return:
        """
        # save the hyperparameters
        self.hyper_parameters['model']['is_training'] = False  # these are set to False for prediction
        self.hyper_parameters['model']['trainable'] = False
        self.hyper_parameters['model']['dropout'] = 0.0

        save_json(jsons=self.hyper_parameters, json_path=self.path_hyper_parameters)

        pg = PreprocessSimGenerator(self.hyper_parameters['model']['path_model_dir'])
        _, len_train = pg.preprocess_get_label_set(self.hyper_parameters['data']['train_data'])
        data_fit_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
                                                             batch_size=self.batch_size,
                                                             path=self.hyper_parameters['data']['train_data'],
                                                             embed=embed,
                                                             epcoh=self.epochs,
                                                             rate=rate)
        _, len_val = pg.preprocess_get_label_set(self.hyper_parameters['data']['val_data'])
        data_dev_generator = pg.preprocess_label_ques_to_idx(embedding_type=self.hyper_parameters['embedding_type'],
                                                             batch_size=self.batch_size,
                                                             path=self.hyper_parameters['data']['val_data'],
                                                             embed=embed,
                                                             epcoh=self.epochs,
                                                             rate=rate)
        steps_per_epoch = len_train // self.batch_size + 1
        validation_steps = len_val // self.batch_size + 1
        # self.model.load_weights(self.model_path)
        # train the model
        self.model.fit_generator(generator=data_fit_generator,
                                 validation_data=data_dev_generator,
                                 callbacks=self.callback(),
                                 epochs=self.epochs,
                                 steps_per_epoch=32,
                                 validation_steps=6)
        # save the embedding if it is trainable (dynamic)
        if self.trainable:
            self.word_embedding.model.save(self.path_fineture)
        # 1600000/6=266666
        # 300000/6=50000

        # 36000/6000
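Note that `fit_generator_sim` computes `steps_per_epoch` and `validation_steps` from the corpus size but then passes the literal values 32 and 6 to `model.fit_generator`; the trailing comments (1600000/6, 300000/6, 36000/6000) look like the corpus-size arithmetic behind those literals. A sketch of the computed-steps variant, under that assumption:

```
# sketch, assuming the computed step counts are what fit_generator should receive
steps_per_epoch = len_train // self.batch_size + 1
validation_steps = len_val // self.batch_size + 1
self.model.fit_generator(generator=data_fit_generator,
                         validation_data=data_dev_generator,
                         callbacks=self.callback(),
                         epochs=self.epochs,
                         steps_per_epoch=steps_per_epoch,
                         validation_steps=validation_steps)
```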

    def load_model(self):
        """
            load the trained model
@@ -221,3 +275,5 @@ class graph:
        else:
            raise RuntimeError("your input sen is wrong, it must be type of list or np.array")
        return self.model.predict(sen)
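For bert/albert embeddings, the prediction scripts later in this commit feed `predict` a two-element list built from `sentence2idx`; a condensed sketch of that call pattern (the question text is arbitrary and `pred[0]` indexing is an assumption):

```
# condensed from the pred_input()/pred_tet() scripts in this commit
ques_embed = ra_ed.sentence2idx('我要打王者荣耀')
x_val = [np.array([ques_embed[0]]), np.array([ques_embed[1]])]  # [input_id, input_type_id]
pred = graph.predict(x_val)
pre = pt.prereocess_idx(pred[0])   # map the probability vector back to labels
```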
@@ -5,12 +5,16 @@
# @function:


from keras_textclassification.data_preprocess.text_preprocess import load_json, save_json
from keras_textclassification.data_preprocess.text_preprocess import load_json, save_json, txt_read
from keras_textclassification.conf.path_config import path_model_dir
path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'

from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
    path_ccks_2020_el_kg_dev, path_ccks_2020_el_kg_tet, path_ccks_2020_el_kg_train,\
    path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
    path_root
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os
@@ -18,13 +22,15 @@ class PreprocessGenerator:
    """
        data preprocessing; the input is a csv of [label, ques] pairs
    """
    def __init__(self):
    def __init__(self, path_model_dir):
        self.l2i_i2l = None
        if os.path.exists(path_fast_text_model_l2i_i2l):
            self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
        self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
        self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
        if os.path.exists(self.path_fast_text_model_l2i_i2l):
            self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)

    def prereocess_idx(self, pred):
        if os.path.exists(path_fast_text_model_l2i_i2l):
        if os.path.exists(self.path_fast_text_model_l2i_i2l):
            pred_i2l = {}
            i2l = self.l2i_i2l['i2l']
            for i in range(len(pred)):
@@ -35,7 +41,7 @@ class PreprocessGenerator:
            raise RuntimeError("path_fast_text_model_label2index is None")

    def prereocess_pred_xid(self, pred):
        if os.path.exists(path_fast_text_model_l2i_i2l):
        if os.path.exists(self.path_fast_text_model_l2i_i2l):
            pred_l2i = {}
            l2i = self.l2i_i2l['l2i']
            for i in range(len(pred)):
@@ -63,7 +69,7 @@ class PreprocessGenerator:
    def preprocess_label_ques_to_idx(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
        label_set, len_all = self.preprocess_get_label_set(path)
        # build the label<->index dicts; skipped if label2index already exists (used for the dev/validation set)
        if not os.path.exists(path_fast_text_model_l2i_i2l):
        if not os.path.exists(self.path_fast_text_model_l2i_i2l):
            count = 0
            label2index = {}
            index2label = {}
@@ -75,9 +81,9 @@ class PreprocessGenerator:
            l2i_i2l = {}
            l2i_i2l['l2i'] = label2index
            l2i_i2l['i2l'] = index2label
            save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
            save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
        else:
            l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
            l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
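The `l2i_i2l` mapping saved above has the structure sketched below once it round-trips through `save_json`/`load_json`; the label names are placeholders. The `i2l` keys come back as strings after the JSON round-trip, which is why the index-to-label lookups elsewhere in this file use `i2l[str(i)]`:

```
# illustrative contents of <path_model_dir>/l2i_i2l.json (labels are placeholders)
{
  "l2i": {"LABEL_A": 0, "LABEL_B": 1},
  "i2l": {"0": "LABEL_A", "1": "LABEL_B"}
}
```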
|
||||
|
||||
# 读取数据的比例
|
||||
len_ql = int(rate * len_all)
|
||||
@ -133,4 +139,229 @@ class PreprocessGenerator:
|
||||
|
||||
|
||||
|
||||
class PreprocessSimGenerator:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self, path_model_dir):
|
||||
self.l2i_i2l = None
|
||||
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx(self, pred):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
pred_i2l[i2l[str(i)]] = pred[i]
|
||||
pred_i2l_rank = [sorted(pred_i2l.items(), key=lambda k: k[1], reverse=True)]
|
||||
return pred_i2l_rank
|
||||
else:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def prereocess_pred_xid(self, pred):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
pred_l2i[pred[i]] = l2i[pred[i]]
|
||||
pred_l2i_rank = [sorted(pred_l2i.items(), key=lambda k: k[1], reverse=True)]
|
||||
return pred_l2i_rank
|
||||
else:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def preprocess_get_label_set(self, path):
|
||||
# 首先获取label,set,即存在的具体类
|
||||
label_set = set()
|
||||
len_all = 0
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
for line in file_csv:
|
||||
len_all += 1
|
||||
data = json.loads(line)
|
||||
label_real = data['label']
|
||||
label_set.add(label_real)
|
||||
file_csv.close()
|
||||
return label_set, len_all
|
||||
|
||||
def preprocess_label_ques_to_idx_old(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
|
||||
label_set, len_all = self.preprocess_get_label_set(path)
|
||||
# 获取label转index字典等, 如果label2index存在则不转换了, dev验证集合的时候用
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
count = 0
|
||||
label2index = {}
|
||||
index2label = {}
|
||||
for label_one in label_set:
|
||||
label2index[label_one] = count
|
||||
index2label[count] = label_one
|
||||
count = count + 1
|
||||
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
# 读取数据的比例
|
||||
len_ql = int(rate * len_all)
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
len_ql = len_all
|
||||
|
||||
def process_line(line):
|
||||
# 对每一条数据操作,获取label和问句index
|
||||
data = json.loads(line)
|
||||
label = data['label']
|
||||
ques_1 = data['sentence1']
|
||||
ques_2 = data['sentence2']
|
||||
offset = data['offset']
|
||||
mention = data["mention"]
|
||||
offset_i = int(offset)
|
||||
# if data.get("label_l2i"):
|
||||
# ques_entity = data.get("label_l2i") + "#" + ques_1[:offset_i] + "#" + mention + "#" + ques_1[offset_i+len(mention):]
|
||||
# else:
|
||||
# ques_entity = ques_1[:offset_i] + "#" + mention + "#" + ques_1[offset_i+len(mention):] + "$$" + ques_2
|
||||
# que_embed = embed.sentence2idx(text=ques_entity)
|
||||
que_embed = embed.sentence2idx(ques_1, second_text=ques_2)
|
||||
label_zeros = [0] * len(l2i_i2l['l2i'])
|
||||
label_zeros[l2i_i2l['l2i'][label]] = 1
|
||||
return que_embed, label_zeros
|
||||
|
||||
for _ in range(epcoh):
|
||||
while True:
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
cout_all_line = 0
|
||||
cnt = 0
|
||||
x, y = [], []
|
||||
# 跳出循环
|
||||
if len_ql < cout_all_line:
|
||||
break
|
||||
for line in file_csv:
|
||||
cout_all_line += 1
|
||||
x_line, y_line = process_line(line)
|
||||
x.append(x_line)
|
||||
y.append(y_line)
|
||||
cnt += 1
|
||||
if cnt == batch_size:
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_all = [x_1, x_2]
|
||||
elif embedding_type == 'xlnet':
|
||||
x_, y_ = x, np.array(y)
|
||||
x_1 = np.array([x[0][0] for x in x_])
|
||||
x_2 = np.array([x[1][0] for x in x_])
|
||||
x_3 = np.array([x[2][0] for x in x_])
|
||||
x_all = [x_1, x_2, x_3]
|
||||
else:
|
||||
x_all, y_ = np.array(x), np.array(y)
|
||||
|
||||
cnt = 0
|
||||
yield (x_all, y_)
|
||||
x, y =[], []
|
||||
file_csv.close()
|
||||
print("preprocess_label_ques_to_idx ok")
|
||||
|
||||
def preprocess_label_ques_to_idx(self, embedding_type, batch_size, path, embed, rate=1, epcoh=20):
|
||||
label_set, len_all = self.preprocess_get_label_set(path)
|
||||
# 获取label转index字典等, 如果label2index存在则不转换了, dev验证集合的时候用
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
count = 0
|
||||
label2index = {}
|
||||
index2label = {}
|
||||
for label_one in label_set:
|
||||
label2index[label_one] = count
|
||||
index2label[count] = label_one
|
||||
count = count + 1
|
||||
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
# 读取数据的比例
|
||||
len_ql = int(rate * len_all)
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
len_ql = len_all
|
||||
|
||||
def process_line(line):
|
||||
# 对每一条数据操作,获取label和问句index
|
||||
data = json.loads(line)
|
||||
label = data['label']
|
||||
ques_1 = data['sentence1']
|
||||
ques_2 = data['sentence2']
|
||||
offset = data['offset']
|
||||
mention_1 = data["mention"]
|
||||
offset_i = int(offset)
|
||||
que_embed_1 = embed.sentence2idx(text=ques_1)
|
||||
que_embed_2 = embed.sentence2idx(text=ques_2)
|
||||
"""ques1"""
|
||||
[input_id_1, input_type_id_1, input_mask_1] = que_embed_1
|
||||
input_start_mask_1 = [0] * len(input_id_1)
|
||||
input_start_mask_1[offset_i] = 1
|
||||
input_end_mask_1 = [0] * len(input_id_1)
|
||||
input_end_mask_1[offset_i + len(mention_1) - 1] = 1
|
||||
input_entity_mask_1 = [0] * len(input_id_1)
|
||||
input_entity_mask_1[offset_i:offset_i + len(mention_1)] = [1] * len(mention_1)
|
||||
"""ques2"""
|
||||
[input_id_2, input_type_id_2, input_mask_2] = que_embed_2
|
||||
kind_2 = [0] * len(input_type_id_2)
|
||||
kind_21 = [0] * len(input_type_id_2)
|
||||
que_2_sp = ques_2.split("|")
|
||||
if len(que_2_sp)>=2:
|
||||
que_2_sp_sp = que_2_sp[0].split(":")
|
||||
if len(que_2_sp_sp)==2:
|
||||
kind_2_start = len(que_2_sp_sp[0]) - 1
|
||||
kind_2_end = kind_2_start + len(que_2_sp_sp[1]) - 1
|
||||
kind_2[kind_2_start:kind_2_end] = [1] * (kind_2_end - kind_2_start)
|
||||
if "标签:" in que_2_sp[1]:
|
||||
que_21_sp_sp = que_2_sp[1].split(":")
|
||||
kind_21_start = len(que_2_sp[0]) + len(que_21_sp_sp[0]) - 1
|
||||
kind_21_end = len(que_2_sp[0]) + len(que_21_sp_sp[0]) + len(que_21_sp_sp[1]) - 1
|
||||
kind_21[kind_21_start:kind_21_end] = [1] * (kind_21_end - kind_21_start)
|
||||
que_embed_x=[input_id_1, input_type_id_1, input_mask_1, input_start_mask_1, input_end_mask_1, input_entity_mask_1,
|
||||
input_id_2, input_type_id_2, input_mask_2, kind_2, kind_21]
|
||||
label_zeros = [0] * len(l2i_i2l['l2i'])
|
||||
label_zeros[l2i_i2l['l2i'][label]] = 1
|
||||
return que_embed_x, label_zeros
|
||||
|
||||
for _ in range(epcoh):
|
||||
while True:
|
||||
file_csv = open(path, "r", encoding="utf-8")
|
||||
cout_all_line = 0
|
||||
cnt = 0
|
||||
x, y = [], []
|
||||
# 跳出循环
|
||||
if len_ql < cout_all_line:
|
||||
break
|
||||
for line in file_csv:
|
||||
cout_all_line += 1
|
||||
x_line, y_line = process_line(line)
|
||||
x.append(x_line)
|
||||
y.append(y_line)
|
||||
cnt += 1
|
||||
if cnt == batch_size:
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(y)
|
||||
x_all = []
|
||||
for i in range(len(x_[0])):
|
||||
x_1 = np.array([x[i] for x in x_])
|
||||
x_all.append(x_1)
|
||||
elif embedding_type == 'xlnet':
|
||||
x_, y_ = x, np.array(y)
|
||||
x_1 = np.array([x[0][0] for x in x_])
|
||||
x_2 = np.array([x[1][0] for x in x_])
|
||||
x_3 = np.array([x[2][0] for x in x_])
|
||||
x_all = [x_1, x_2, x_3]
|
||||
else:
|
||||
x_all, y_ = np.array(x), np.array(y)
|
||||
|
||||
cnt = 0
|
||||
yield (x_all, y_)
|
||||
x, y =[], []
|
||||
file_csv.close()
|
||||
print("preprocess_label_ques_to_idx ok")
|
||||
|
@ -5,16 +5,15 @@
|
||||
# @function :data utils of text classification
|
||||
|
||||
|
||||
from keras_textclassification.conf.path_config import path_model_dir
|
||||
path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
|
||||
# from keras_textclassification.conf.path_config import path_model_dir
|
||||
# path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
# path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
from collections import Counter
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import random
|
||||
import jieba
|
||||
# import jieba
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
@ -199,13 +198,15 @@ class PreprocessText:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self):
|
||||
def __init__(self, path_model_dir):
|
||||
self.l2i_i2l = None
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
@ -216,7 +217,7 @@ class PreprocessText:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def prereocess_pred_xid(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
@ -239,7 +240,7 @@ class PreprocessText:
|
||||
random.shuffle(indexs)
|
||||
ques, label = ques[indexs].tolist(), label[indexs].tolist()
|
||||
# 如果label2index存在则不转换了
|
||||
if not os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
label_set = set(label)
|
||||
count = 0
|
||||
label2index = {}
|
||||
@ -252,9 +253,9 @@ class PreprocessText:
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
len_ql = int(rate * len(ques))
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
@ -307,13 +308,15 @@ class PreprocessTextMulti:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self):
|
||||
def __init__(self, path_model_dir):
|
||||
self.l2i_i2l = None
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
@ -324,7 +327,7 @@ class PreprocessTextMulti:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def prereocess_pred_xid(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
@ -365,7 +368,7 @@ class PreprocessTextMulti:
|
||||
random.shuffle(indexs)
|
||||
ques, label = ques[indexs].tolist(), label[indexs].tolist()
|
||||
|
||||
if not os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
from keras_textclassification.conf.path_config import path_byte_multi_news_label
|
||||
byte_multi_news_label = txt_read(path_byte_multi_news_label)
|
||||
byte_multi_news_label = [i.strip().upper() for i in byte_multi_news_label]
|
||||
@ -383,9 +386,9 @@ class PreprocessTextMulti:
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
len_label_set = len(l2i_i2l['l2i'])
|
||||
|
||||
|
||||
@ -438,13 +441,15 @@ class PreprocessSim:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self):
|
||||
def __init__(self, path_model_dir):
|
||||
self.l2i_i2l = None
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
@ -455,7 +460,7 @@ class PreprocessSim:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def prereocess_pred_xid(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
@ -467,23 +472,48 @@ class PreprocessSim:
|
||||
|
||||
def preprocess_label_ques_to_idx(self, embedding_type, path, embed,
|
||||
rate=1, shuffle=True, graph=None):
|
||||
data = pd.read_csv(path)
|
||||
ques_1 = data['sentence1'].tolist()
|
||||
ques_2 = data['sentence2'].tolist()
|
||||
label = data['label'].tolist()
|
||||
if "json" in path:
|
||||
datas = txt_read(path)
|
||||
ques_1 = []
|
||||
ques_2 = []
|
||||
label = []
|
||||
offset = []
|
||||
mention = []
|
||||
for data_str in datas:
|
||||
data = json.loads(data_str)
|
||||
ques_1 += [data['sentence1']]
|
||||
ques_2 += [data['sentence2']]
|
||||
mention += [data['mention']]
|
||||
label += [data['label']]
|
||||
offset += [data['offset']]
|
||||
elif "csv" in path:
|
||||
data = pd.read_csv(path)
|
||||
ques_1 = data['sentence1'].tolist()
|
||||
ques_2 = data['sentence2'].tolist()
|
||||
label = data['label'].tolist()
|
||||
offset = data['offset'].tolist()
|
||||
|
||||
ques_1 = [str(q1).upper() for q1 in ques_1]
|
||||
ques_2 = [str(q2).upper() for q2 in ques_2]
|
||||
|
||||
label = [str(l).upper() for l in label]
|
||||
# label = [str(l).upper() for l in label]
|
||||
label = [str(l) for l in label]
|
||||
if shuffle:
|
||||
ques_1 = np.array(ques_1)
|
||||
ques_2 = np.array(ques_2)
|
||||
label = np.array(label)
|
||||
mention = np.array(mention)
|
||||
offset = np.array(offset)
|
||||
|
||||
indexs = [ids for ids in range(len(label))]
|
||||
random.shuffle(indexs)
|
||||
ques_1, ques_2, label = ques_1[indexs].tolist(), ques_2[indexs].tolist(), label[indexs].tolist()
|
||||
ques_1 = ques_1[indexs].tolist()
|
||||
ques_2 = ques_2[indexs].tolist()
|
||||
label = label[indexs].tolist()
|
||||
mention = mention[indexs].tolist()
|
||||
offset = offset[indexs].tolist()
|
||||
# 如果label2index存在则不转换了
|
||||
if not os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
label_set = set(label)
|
||||
count = 0
|
||||
label2index = {}
|
||||
@ -496,12 +526,12 @@ class PreprocessSim:
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
len_ql = int(rate * len(label))
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
if len_ql <= 1: # sample时候不生效,使得语料足够训练
|
||||
len_ql = len(label)
|
||||
|
||||
x = []
|
||||
@ -509,8 +539,78 @@ class PreprocessSim:
|
||||
for i in tqdm(range(len_ql)):
|
||||
que_1 = ques_1[i]
|
||||
que_2 = ques_2[i]
|
||||
que_embed = embed.sentence2idx(text=que_1, second_text=que_2)
|
||||
x.append(que_embed) # [[], ]
|
||||
mention_1 = mention[i]
|
||||
# que_embed = embed.sentence2idx(text=que_1, second_text=que_2)
|
||||
# x.append(que_embed) # [[], ]
|
||||
offset_i = int(offset[i])
|
||||
# ques_entity = que_1 + "##" + que_1[offset_i+len(que_2):]
|
||||
# ques_entity = que_1
|
||||
# que_embed1 = embed.sentence2idx(text=que_1, second_text=que_2)
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
########################################1111111##############
|
||||
# [input_id, input_type_id] = que_embed
|
||||
# input_entity_mask = [0] * len(input_id)
|
||||
# input_entity_mask[offset_i:offset_i+len(que_2)] = [1] * len(que_2)
|
||||
# # x.append(que_embed) # [[], ]
|
||||
# x.append([input_id, input_type_id, input_entity_mask])
|
||||
# # x.append([input_id, input_type_id, input_entity_mask, offset_i])
|
||||
########################################2222222指针网络######################################
|
||||
# [input_id, input_type_id] = que_embed
|
||||
# input_start_mask = [0] * len(input_id)
|
||||
# input_start_mask[offset_i] = 1
|
||||
# input_end_mask = [0] * len(input_id)
|
||||
# input_end_mask[offset_i + len(mention_1) - 1] = 1
|
||||
# x.append([input_id, input_type_id, input_start_mask, input_start_mask])
|
||||
########################################分开两个句子###################################################
|
||||
que_embed_1 = embed.sentence2idx(text=que_1)
|
||||
# que_embed_1 = [que[:54] for que in que_embed_1]
|
||||
|
||||
que_embed_2 = embed.sentence2idx(text=que_2)
|
||||
# que_embed_2 = [que[:256-54] for que in que_embed_2]
|
||||
try:
|
||||
"""ques1"""
|
||||
[input_id_1, input_type_id_1, input_mask_1] = que_embed_1
|
||||
input_start_mask_1 = [0] * len(input_id_1)
|
||||
input_start_mask_1[offset_i] = 1
|
||||
input_end_mask_1 = [0] * len(input_id_1)
|
||||
input_end_mask_1[offset_i+len(mention_1)-1] = 1
|
||||
input_entity_mask_1 = [0] * len(input_id_1)
|
||||
input_entity_mask_1[offset_i:offset_i+len(mention_1)] = [1] * len(mention_1)
|
||||
"""ques2"""
|
||||
[input_id_2, input_type_id_2, input_mask_2] = que_embed_2
|
||||
kind_2 = [0] * len(input_type_id_2)
|
||||
que_2_sp = que_2.split("|")
|
||||
que_2_sp_sp = que_2_sp[0].split(":")
|
||||
kind_2_start = len(que_2_sp_sp[0]) - 1
|
||||
kind_2_end = kind_2_start + len(que_2_sp_sp[1]) - 1
|
||||
kind_2[kind_2_start:kind_2_end] = [1] * (kind_2_end-kind_2_start)
|
||||
kind_21 = [0] * len(input_type_id_2)
|
||||
if "标签" in que_2_sp[1]:
|
||||
que_21_sp_sp = que_2_sp[1].split(":")
|
||||
kind_21_start = len(que_2_sp[0]) + len(que_21_sp_sp[0]) - 1
|
||||
kind_21_end = len(que_2_sp[0]) + len(que_21_sp_sp[0]) + len(que_21_sp_sp[1]) - 1
|
||||
kind_21[kind_21_start:kind_21_end] = [1] * (kind_21_end - kind_21_start)
|
||||
except Exception as e:
|
||||
print(str(e))
|
||||
gg = 0
|
||||
|
||||
x.append([input_id_1, input_type_id_1, input_mask_1, input_start_mask_1, input_end_mask_1, input_entity_mask_1,
|
||||
input_id_2, input_type_id_2, input_mask_2, kind_2, kind_21])
|
||||
|
||||
|
||||
elif embedding_type == 'xlnet':
|
||||
if embed.trainable:
|
||||
[token_input, segment_input, memory_length_input, mask_input] = que_embed
|
||||
input_entity_mask = [0] * len(token_input)
|
||||
input_entity_mask[offset_i:offset_i + len(que_2)] = [1] * len(que_2)
|
||||
# x.append(que_embed) # [[], ]
|
||||
x.append([token_input, segment_input, memory_length_input, mask_input, input_entity_mask])
|
||||
else:
|
||||
[token_input, segment_input, memory_length_input] = que_embed
|
||||
input_entity_mask = [0] * len(token_input)
|
||||
input_entity_mask[offset_i:offset_i + len(que_2)] = [1] * len(que_2)
|
||||
x.append([token_input, segment_input, memory_length_input, input_entity_mask])
|
||||
|
||||
label_zo = []
|
||||
print("label to onehot start!")
|
||||
label_len_ql = label[0:len_ql]
|
||||
@ -522,20 +622,26 @@ class PreprocessSim:
|
||||
|
||||
if embedding_type in ['bert', 'albert']:
|
||||
x_, y_ = np.array(x), np.array(label_zo)
|
||||
x_1 = np.array([x[0] for x in x_])
|
||||
x_2 = np.array([x[1] for x in x_])
|
||||
x_all = [x_1, x_2]
|
||||
# x_1 = np.array([x[0] for x in x_])
|
||||
# x_2 = np.array([x[1] for x in x_])
|
||||
# x_3 = np.array([x[2] for x in x_])
|
||||
# x_4 = np.array([x[3] for x in x_])
|
||||
# x_all = [x_1, x_2, x_3, x_4]
|
||||
x_all = []
|
||||
for i in range(len(x_[0])):
|
||||
x_all.append(np.array([x[i] for x in x_]))
|
||||
return x_all, y_
|
||||
elif embedding_type == 'xlnet':
|
||||
x_, y_ = x, np.array(label_zo)
|
||||
x_1 = np.array([x[0][0] for x in x_])
|
||||
x_2 = np.array([x[1][0] for x in x_])
|
||||
x_3 = np.array([x[2][0] for x in x_])
|
||||
x_4 = np.array([x[3][0] for x in x_])
|
||||
if embed.trainable:
|
||||
x_4 = np.array([x[3][0] for x in x_])
|
||||
x_all = [x_1, x_2, x_3, x_4]
|
||||
x_5 = np.array([x[4][0] for x in x_])
|
||||
x_all = [x_1, x_2, x_3, x_4, x_5]
|
||||
else:
|
||||
x_all = [x_1, x_2, x_3]
|
||||
x_all = [x_1, x_2, x_3, x_4]
|
||||
return x_all, y_
|
||||
else:
|
||||
x_, y_ = np.array(x), np.array(label_zo)
|
||||
@ -546,13 +652,15 @@ class PreprocessSimConv2019:
|
||||
"""
|
||||
数据预处理, 输入为csv格式, [label,ques]
|
||||
"""
|
||||
def __init__(self):
|
||||
def __init__(self, path_model_dir):
|
||||
self.l2i_i2l = None
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
self.path_fast_text_model_vocab2index = path_model_dir + 'vocab2index.json'
|
||||
self.path_fast_text_model_l2i_i2l = path_model_dir + 'l2i_i2l.json'
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
self.l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
def prereocess_idx(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_i2l = {}
|
||||
i2l = self.l2i_i2l['i2l']
|
||||
for i in range(len(pred)):
|
||||
@ -563,7 +671,7 @@ class PreprocessSimConv2019:
|
||||
raise RuntimeError("path_fast_text_model_label2index is None")
|
||||
|
||||
def prereocess_pred_xid(self, pred):
|
||||
if os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
pred_l2i = {}
|
||||
l2i = self.l2i_i2l['l2i']
|
||||
for i in range(len(pred)):
|
||||
@ -593,7 +701,7 @@ class PreprocessSimConv2019:
|
||||
random.shuffle(indexs)
|
||||
ques_1, ques_2, label, category = ques_1[indexs].tolist(), ques_2[indexs].tolist(), label[indexs].tolist(), category[indexs].tolist()
|
||||
# 如果label2index存在则不转换了
|
||||
if not os.path.exists(path_fast_text_model_l2i_i2l):
|
||||
if not os.path.exists(self.path_fast_text_model_l2i_i2l):
|
||||
label_set = set(label)
|
||||
count = 0
|
||||
label2index = {}
|
||||
@ -606,9 +714,9 @@ class PreprocessSimConv2019:
|
||||
l2i_i2l = {}
|
||||
l2i_i2l['l2i'] = label2index
|
||||
l2i_i2l['i2l'] = index2label
|
||||
save_json(l2i_i2l, path_fast_text_model_l2i_i2l)
|
||||
save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
|
||||
else:
|
||||
l2i_i2l = load_json(path_fast_text_model_l2i_i2l)
|
||||
l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)
|
||||
|
||||
len_ql = int(rate * len(label))
|
||||
if len_ql <= 500: # sample时候不生效,使得语料足够训练
|
||||
|
@ -2,10 +2,10 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @time : 2020/3/31 19:10
|
||||
# @author : Mo
|
||||
# @function:
|
||||
# @function: Attention of dot
|
||||
|
||||
|
||||
from keras.regularizers import L1L2, Regularizer
|
||||
from keras.regularizers import L1L2
|
||||
# from keras.engine.topology import Layer
|
||||
from keras.layers import Layer
|
||||
from keras import backend as K
|
||||
|
@ -1,12 +1,13 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
# !/usr/bin/python
|
||||
# @time :2019/6/22 7:35
|
||||
# @time :2019/6/22 19:35
|
||||
# @author :Mo
|
||||
# @function :self Attention()
|
||||
# @function :Attention of itself
|
||||
|
||||
|
||||
from keras.regularizers import L1L2, Regularizer
|
||||
from keras.engine.topology import Layer
|
||||
# from keras.engine.topology import Layer
|
||||
from keras.layers import Layer
|
||||
from keras import backend as K
|
||||
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
# @author :Mo
|
||||
# @function :
|
||||
|
||||
|
||||
from keras.layers import Layer
|
||||
import tensorflow as tf
|
||||
|
||||
|
@ -15,7 +15,6 @@ from keras import backend as K
|
||||
from keras import regularizers
|
||||
|
||||
from keras_textclassification.base.graph import graph
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@ -52,6 +51,9 @@ class AlbertGraph(graph):
|
||||
# x = Concatenate(axis=1)(concat_out)
|
||||
# x = Dropout(self.dropout)(x)
|
||||
x = Flatten()(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
x = Dense(128, activation="tanh")(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
# 最后就是softmax
|
||||
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
|
||||
output_layers = [dense_layer]
|
||||
|
@ -12,9 +12,9 @@ import os
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 地址
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_hyper_parameters # , path_model_dir
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, path_root
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, read_and_process, load_json
|
||||
# 模型图
|
||||
@ -26,6 +26,8 @@ import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert/"
|
||||
|
||||
|
||||
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
|
||||
"""
|
||||
@ -46,7 +48,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -57,7 +59,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -91,7 +93,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
"""
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -99,7 +101,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -115,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -127,12 +129,13 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert/hyper_parameters.json"
|
||||
|
||||
# 测试集预测
|
||||
pred_tet(path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
|
||||
pred_tet(path_hyper_parameter=path_fineture, path_test=path_baidu_qa_2019_valid, rate=1) # sample条件下设为1,否则训练语料可能会很少
|
||||
|
||||
# 可输入 input 预测
|
||||
pred_input()
|
||||
pred_input(path_hyper_parameter=path_fineture)
|
||||
|
||||
# pred
|
||||
# precision recall f1-score support
|
||||
|
@ -19,15 +19,31 @@ sys.path.append(project_path)
|
||||
# 地址
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid,\
|
||||
path_ccks_2020_nil_train, path_ccks_2020_nil_dev, path_root
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
|
||||
# 模型图
|
||||
from keras_textclassification.m00_Albert.graph import AlbertGraph as Graph
|
||||
# 计算时间
|
||||
import time
|
||||
|
||||
|
||||
# fast_text config
|
||||
# 模型目录
|
||||
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert/"
|
||||
# 语料地址
|
||||
path_model = path_root + '/data/model/ccks_2020_el_cls_albert/model_fast_text.h5'
|
||||
# 超参数保存地址
|
||||
path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_albert/hyper_parameters.json'
|
||||
# embedding微调保存地址
|
||||
path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert/embedding_trainable.h5"
|
||||
|
||||
|
||||
if not os.path.exists(path_model_dir):
|
||||
os.mkdir(path_model_dir)
|
||||
|
||||
|
||||
def train(hyper_parameters=None, rate=1.0):
|
||||
"""
|
||||
训练函数
|
||||
@ -37,24 +53,24 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
"""
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 20, # 句子最大长度, 固定 推荐20-50
|
||||
'len_max': 50, # 句子最大长度, 固定 推荐20-50
|
||||
'embed_size': 768, # 字/词向量维度
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
|
||||
'embedding_type': 'albert', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
|
||||
'gpu_memory_fraction': 0.76, #gpu使用率
|
||||
'gpu_memory_fraction': 0.78, #gpu使用率
|
||||
'model': {'label': 17, # 类别数
|
||||
'batch_size': 32, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'filters': [2, 3, 4, 5], # 卷积核尺寸
|
||||
'filters_num': 300, # 卷积个数 text-cnn:300-600
|
||||
'channel_size': 1, # CNN通道数
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'epochs': 20, # 训练最大轮次
|
||||
'decay_step': 1000, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.999, # 学习率衰减系数, 乘法
|
||||
'epochs': 1, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'lr': 5e-5, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
'lr': 5e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
'l2': 1e-9, # l2正则化
|
||||
'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数
|
||||
'loss': 'categorical_crossentropy', # 损失函数
|
||||
@ -66,11 +82,14 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
'path_hyper_parameters': path_hyper_parameters, # 模型(包括embedding),超参数地址,
|
||||
'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等
|
||||
},
|
||||
'embedding': {'layer_indexes': [i for i in range(25)] + [-i for i in range(25)], # bert取的层数,包括embedding层
|
||||
'embedding': {'layer_indexes': [11] # [i for i in range(25)] + [-i for i in range(25)], # bert取的层数,包括embedding层
|
||||
# 'corpus_path': 'D:/soft_install/dataset/bert-model/albert_tiny_489k', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换,https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架,https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
},
|
||||
'data':{'train_data': path_baidu_qa_2019_train, # 训练数据
|
||||
'data':{
|
||||
'train_data': path_baidu_qa_2019_train, # 训练数据
|
||||
'val_data': path_baidu_qa_2019_valid # 验证数据
|
||||
# 'train_data': path_ccks_2020_nil_train, # 训练数据
|
||||
# 'val_data': path_ccks_2020_nil_dev # 验证数据
|
||||
},
|
||||
}
|
||||
|
||||
@ -82,7 +101,8 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# pt = PreprocessSim(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -51,6 +51,7 @@ class BertGraph(graph):
|
||||
# concat_out.append(x)
|
||||
# x = Concatenate(axis=1)(concat_out)
|
||||
# x = Dropout(self.dropout)(x)
|
||||
|
||||
x = Flatten()(x)
|
||||
# 最后就是softmax
|
||||
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
|
||||
|
@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
"""
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
"""
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -85,7 +85,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -7,9 +7,10 @@
|
||||
|
||||
|
||||
from keras_textclassification.base.graph import graph
|
||||
from keras.layers import GlobalMaxPooling1D
|
||||
from keras.layers import Dense
|
||||
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
|
||||
from keras.layers import Dense, Dropout
|
||||
from keras.models import Model
|
||||
import keras.backend as K
|
||||
|
||||
|
||||
class FastTextGraph(graph):
|
||||
@ -29,9 +30,129 @@ class FastTextGraph(graph):
|
||||
"""
|
||||
super().create_model(hyper_parameters)
|
||||
embedding = self.word_embedding.output
|
||||
x = GlobalMaxPooling1D()(embedding)
|
||||
x_m = GlobalMaxPooling1D()(embedding)
|
||||
x_g = GlobalAveragePooling1D()(embedding)
|
||||
x = Concatenate()([x_g, x_m])
|
||||
x = Dense(128, activation="tanh")(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
output = Dense(self.label, activation=self.activate_classify)(x)
|
||||
self.model = Model(inputs=self.word_embedding.input, outputs=output)
|
||||
self.model.summary(132)
|
||||
|
||||
|
||||
# def focal_loss(self, gamma=2, alpha=0.75): # 0.25, 0.5
|
||||
def focal_loss(self, gamma=2, alpha=0.75, batch_size=None, label_num=None, epsilon=1.e-7, multi_dim=False, use_softmax=True):
|
||||
from tensorflow.python.ops import array_ops
|
||||
import tensorflow as tf
|
||||
def focal_loss_fixed(y_true, y_pred): # with tensorflow
|
||||
eps = 1e-12
|
||||
y_pred = K.clip(y_pred, eps, 1. - eps) # improve the stability of the focal loss and see issues 1 for more information
|
||||
pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
|
||||
pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
|
||||
loss = -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
|
||||
return loss
|
||||
|
||||
def focal_loss_all(prediction_tensor, target_tensor):
|
||||
r"""Compute focal loss for predictions.
|
||||
Multi-labels Focal loss formula:
|
||||
FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
|
||||
,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
|
||||
Args:
|
||||
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
|
||||
num_classes] representing the predicted logits for each class
|
||||
target_tensor: A float tensor of shape [batch_size, num_anchors,
|
||||
num_classes] representing one-hot encoded classification targets
|
||||
weights: A float tensor of shape [batch_size, num_anchors]
|
||||
alpha: A scalar tensor for focal loss alpha hyper-parameter
|
||||
gamma: A scalar tensor for focal loss gamma hyper-parameter
|
||||
Returns:
|
||||
loss: A (scalar) tensor representing the value of the loss function
|
||||
"""
|
||||
sigmoid_p = tf.nn.sigmoid(prediction_tensor)
|
||||
zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
|
||||
|
||||
# For poitive prediction, only need consider front part loss, back part is 0;
|
||||
# target_tensor > zeros <=> z=1, so poitive coefficient = z - p.
|
||||
pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)
|
||||
|
||||
# For negative prediction, only need consider back part loss, front part is 0;
|
||||
# target_tensor > zeros <=> z=1, so negative coefficient = 0.
|
||||
neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
|
||||
per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
|
||||
- (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
|
||||
return tf.reduce_sum(per_entry_cross_ent)
|
||||
|
||||
def focal_loss_category(logits, labels):
|
||||
'''
|
||||
:param logits: [batch_size, n_class]
|
||||
:param labels: [batch_size] not one-hot !!!
|
||||
:return: -alpha*(1-y)^r * log(y)
|
||||
它是在哪实现 1- y 的? 通过gather选择的就是1-p,而不是通过计算实现的;
|
||||
logits soft max之后是多个类别的概率,也就是二分类时候的1-P和P;多分类的时候不是1-p了;
|
||||
|
||||
怎么把alpha的权重加上去?
|
||||
通过gather把alpha选择后变成batch长度,同时达到了选择和维度变换的目的
|
||||
|
||||
是否需要对logits转换后的概率值进行限制?
|
||||
需要的,避免极端情况的影响
|
||||
|
||||
针对输入是 (N,P,C )和 (N,P)怎么处理?
|
||||
先把他转换为和常规的一样形状,(N*P,C) 和 (N*P,)
|
||||
|
||||
bug:
|
||||
ValueError: Cannot convert an unknown Dimension to a Tensor: ?
|
||||
因为输入的尺寸有时是未知的,导致了该bug,如果batchsize是确定的,可以直接修改为batchsize
|
||||
|
||||
'''
|
||||
|
||||
if multi_dim:
|
||||
logits = tf.reshape(logits, [-1, logits.shape[2]])
|
||||
labels = tf.reshape(labels, [-1])
|
||||
|
||||
# (Class,) per-class alpha weights, so the length must match label_num
alpha = tf.constant([0.5]*label_num, dtype=tf.float32)

labels = tf.argmax(labels, axis=-1)  # take the class index if y_true comes in one-hot
labels = tf.cast(labels, dtype=tf.int32)
logits = tf.cast(logits, tf.float32)
|
||||
if use_softmax:
|
||||
# (N,Class) > N*Class
|
||||
softmax = tf.reshape(tf.nn.softmax(logits), [-1]) # [batch_size * n_class]
|
||||
else:
|
||||
softmax = tf.reshape(tf.nn.sigmoid(logits), [-1]) # [batch_size * n_class]
|
||||
# (N,) > (N,) ,但是数值变换了,变成了每个label在N*Class中的位置
|
||||
# labels_shift = tf.range(0, logits.shape[0]) * logits.shape[1] + labels
|
||||
labels_shift = tf.range(0, batch_size) * label_num + labels
|
||||
# (N*Class,) > (N,)
|
||||
prob = tf.gather(softmax, labels_shift)
|
||||
# 预防预测概率值为0的情况 ; (N,)
|
||||
prob = tf.clip_by_value(prob, epsilon, 1. - epsilon)
|
||||
# (Class ,1) > (N,)
|
||||
alpha_choice = tf.gather(alpha, labels)
|
||||
# (N,) > (N,)
|
||||
weight = tf.pow(tf.subtract(1., prob), gamma)
|
||||
weight = tf.multiply(alpha_choice, weight)
|
||||
# (N,) > 1
|
||||
loss = -tf.reduce_sum(tf.multiply(weight, tf.log(prob)))
|
||||
return loss
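# Worked example (added, illustrative only; assumes batch_size=2 and label_num=3): with labels=[2, 0]
# and softmax flattened to shape (2*3,), labels_shift = tf.range(0, batch_size) * label_num + labels
# = [0*3 + 2, 1*3 + 0] = [2, 3]; tf.gather(softmax, labels_shift) then picks the probability of the
# true class for each sample, and tf.pow(1. - prob, gamma) is the focusing weight described above.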
|
||||
|
||||
return focal_loss_fixed
|
||||
|
||||
|
||||
def create_compile(self):
|
||||
"""
|
||||
构建优化器、损失函数和评价函数
|
||||
:return:
|
||||
"""
|
||||
from keras_textclassification.keras_layers.keras_radam import RAdam
|
||||
from keras.optimizers import Adam
|
||||
# self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
# loss=[self.focal_loss(alpha=.25, gamma=2)],
|
||||
# metrics=['accuracy'])
|
||||
|
||||
self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=[self.focal_loss(alpha=.25, gamma=2)], # self.loss, #
|
||||
# loss_weights=[0.6, 0.5],
|
||||
# loss=[self.focal_loss(gamma=2, alpha=0.25, batch_size=self.batch_size, label_num=self.label, epsilon=1.e-7, multi_dim=False, use_softmax=False)],
|
||||
# loss=[self.focal_loss(gamma=2, alpha=0.75)],
|
||||
metrics=['accuracy']) # Any optimize
|
||||
|
@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
"""
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -9,35 +9,73 @@
|
||||
import pathlib
|
||||
import sys
|
||||
import os
|
||||
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 地址
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
|
||||
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
|
||||
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
|
||||
path_root
|
||||
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
|
||||
|
||||
# 模型图
|
||||
from keras_textclassification.m01_FastText.graph import FastTextGraph as Graph
|
||||
# from keras_textclassification.m02_TextCNN.graph import TextCNNGraph as Graph
|
||||
|
||||
# 计算时间
|
||||
import time
|
||||
|
||||
|
||||
# # fast_text config
|
||||
# # 模型目录
|
||||
# path_model_dir = path_root + "/data/model/ccks_2020_el_cls_albert_fasttext/"
|
||||
# # 语料地址
|
||||
# path_model = path_root + '/data/model/ccks_2020_el_cls_albert_fasttext/model_fast_text.h5'
|
||||
# # 超参数保存地址
|
||||
# path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_albert_fasttext/hyper_parameters.json'
|
||||
# # embedding微调保存地址
|
||||
# path_fineture = path_root + "/data/model/ccks_2020_el_cls_albert_fasttext/embedding_trainable.h5"
|
||||
|
||||
|
||||
|
||||
# fast_text config
|
||||
# # 模型目录
|
||||
# path_model_dir = path_root + "/data/model/ccks_2020_el_cls_random_fasttext/"
|
||||
# # 语料地址
|
||||
# path_model = path_root + '/data/model/ccks_2020_el_cls_random_fasttext/model_fast_text.h5'
|
||||
# # 超参数保存地址
|
||||
# path_hyper_parameters = path_root + '/data/model/ccks_2020_el_cls_random_fasttext/hyper_parameters.json'
|
||||
# # embedding微调保存地址
|
||||
# path_fineture = path_root + "/data/model/ccks_2020_el_cls_random_fasttext/embedding_trainable.h5"
|
||||
# if not os.path.exists(path_model_dir):
|
||||
# os.mkdir(path_model_dir)
|
||||
|
||||
|
||||
def train(hyper_parameters=None, rate=1.0):
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 50, # 句子最大长度, 固定推荐20-50, bert越长会越慢, 占用空间也会变大, 小心OOM
|
||||
'embed_size': 150, # 字/词向量维度, bert取768, word取300, char可以更小些
|
||||
'len_max': 56, # 句子最大长度, 固定推荐20-50, bert越长会越慢, 占用空间也会变大, 小心OOM
|
||||
'embed_size': 300, # 字/词向量维度, bert取768, word取300, char可以更小些
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'ngram', # 级别, 最小单元, 字/词, 填 'char' or 'word', 注意:word2vec模式下训练语料要首先切好
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word', 注意:word2vec模式下训练语料要首先切好
|
||||
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
|
||||
# 'gpu_memory_fraction': 0.76, #gpu使用率
|
||||
'model': {'label': 17, # 类别数
|
||||
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'batch_size': 256, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 1000, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'decay_rate': 0.999, # 学习率衰减系数, 乘法
|
||||
'filters': [3, 7, 7],
|
||||
'filters_num': 300, # 卷积个数 论文中 filters_num=150,300
|
||||
'epochs': 20, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'lr': 1e-3, # 学习率,bert取5e-5,其他取1e-3, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
@ -54,11 +92,11 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等
|
||||
},
|
||||
'embedding': {'layer_indexes': [24], # bert取的层数
|
||||
'ngram_ns': [3],
|
||||
'corpus_path': path_baidu_qa_2019_train,
|
||||
# 'ngram_ns': [3],
|
||||
# 'corpus_path': path_baidu_qa_2019_train,
|
||||
},
|
||||
'data':{'train_data': path_baidu_qa_2019_train, # 训练数据
|
||||
'val_data': path_baidu_qa_2019_valid # 验证数据
|
||||
'data':{'train_data': path_ccks_2020_el_cls_train, # 训练数据
|
||||
'val_data': path_ccks_2020_el_cls_dev # 验证数据
|
||||
},
|
||||
}
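# Usage sketch (added, assuming the defaults above): train(rate=0.01) trains on roughly 1% of the
# corpus for a quick smoke test, while train(rate=1) runs on the full training data.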
|
||||
|
||||
@ -70,7 +108,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessSim(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -36,7 +36,7 @@ class TextCNNGraph(graph):
|
||||
kernel_size = (filter, self.embed_size),
|
||||
padding = 'valid',
|
||||
kernel_initializer = 'normal',
|
||||
activation = 'relu',
|
||||
activation = 'tanh',
|
||||
)(embedding_reshape)
|
||||
pooled = MaxPool2D(pool_size = (self.len_max - filter + 1, 1),
|
||||
strides = (1, 1),
|
||||
@ -45,8 +45,132 @@ class TextCNNGraph(graph):
|
||||
conv_pools.append(pooled)
|
||||
# 拼接
|
||||
x = Concatenate(axis=-1)(conv_pools)
|
||||
x = Dropout(self.dropout)(x)
|
||||
x = Flatten()(x)
|
||||
x = Dense(units=64, activation='tanh')(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
output = Dense(units=self.label, activation=self.activate_classify)(x)
|
||||
self.model = Model(inputs=self.word_embedding.input, outputs=output)
|
||||
self.model.summary(120)
|
||||
|
||||
|
||||
|
||||
# def focal_loss(self, gamma=2, alpha=0.75): # 0.25, 0.5
|
||||
def focal_loss(self, gamma=2, alpha=0.75, batch_size=None, label_num=None, epsilon=1.e-7, multi_dim=False, use_softmax=True):
|
||||
from tensorflow.python.ops import array_ops
|
||||
import keras.backend as K
|
||||
import tensorflow as tf
|
||||
def focal_loss_fixed(y_true, y_pred): # with tensorflow
|
||||
eps = 1e-12
|
||||
y_pred = K.clip(y_pred, eps, 1. - eps) # improve the stability of the focal loss and see issues 1 for more information
|
||||
pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
|
||||
pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
|
||||
loss = -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
|
||||
return loss
|
||||
|
||||
def focal_loss_all(prediction_tensor, target_tensor):
|
||||
r"""Compute focal loss for predictions.
|
||||
Multi-labels Focal loss formula:
|
||||
FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
|
||||
,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
|
||||
Args:
|
||||
prediction_tensor: A float tensor of shape [batch_size, num_anchors,
|
||||
num_classes] representing the predicted logits for each class
|
||||
target_tensor: A float tensor of shape [batch_size, num_anchors,
|
||||
num_classes] representing one-hot encoded classification targets
|
||||
weights: A float tensor of shape [batch_size, num_anchors]
|
||||
alpha: A scalar tensor for focal loss alpha hyper-parameter
|
||||
gamma: A scalar tensor for focal loss gamma hyper-parameter
|
||||
Returns:
|
||||
loss: A (scalar) tensor representing the value of the loss function
|
||||
"""
|
||||
sigmoid_p = tf.nn.sigmoid(prediction_tensor)
|
||||
zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
|
||||
|
||||
# For positive predictions, only the front part of the loss matters; the back part is 0;
# target_tensor > zeros <=> z=1, so the positive coefficient = z - p.
pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)

# For negative predictions, only the back part of the loss matters; the front part is 0;
# target_tensor > zeros <=> z=1, so the negative coefficient is 0.
neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
|
||||
per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
|
||||
- (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
|
||||
return tf.reduce_sum(per_entry_cross_ent)
|
||||
|
||||
def focal_loss_category(logits, labels):
|
||||
'''
|
||||
:param logits: [batch_size, n_class]
|
||||
:param labels: [batch_size] not one-hot !!!
|
||||
:return: -alpha*(1-y)^r * log(y)
|
||||
它是在哪实现 1- y 的? 通过gather选择的就是1-p,而不是通过计算实现的;
|
||||
logits soft max之后是多个类别的概率,也就是二分类时候的1-P和P;多分类的时候不是1-p了;
|
||||
|
||||
怎么把alpha的权重加上去?
|
||||
通过gather把alpha选择后变成batch长度,同时达到了选择和维度变换的目的
|
||||
|
||||
是否需要对logits转换后的概率值进行限制?
|
||||
需要的,避免极端情况的影响
|
||||
|
||||
针对输入是 (N,P,C )和 (N,P)怎么处理?
|
||||
先把他转换为和常规的一样形状,(N*P,C) 和 (N*P,)
|
||||
|
||||
bug:
|
||||
ValueError: Cannot convert an unknown Dimension to a Tensor: ?
|
||||
因为输入的尺寸有时是未知的,导致了该bug,如果batchsize是确定的,可以直接修改为batchsize
|
||||
|
||||
'''
|
||||
|
||||
if multi_dim:
|
||||
logits = tf.reshape(logits, [-1, logits.shape[2]])
|
||||
labels = tf.reshape(labels, [-1])
|
||||
|
||||
# (Class,) per-class alpha weights, so the length must match label_num
alpha = tf.constant([0.5]*label_num, dtype=tf.float32)

labels = tf.argmax(labels, axis=-1)  # take the class index if y_true comes in one-hot
labels = tf.cast(labels, dtype=tf.int32)
logits = tf.cast(logits, tf.float32)
|
||||
if use_softmax:
|
||||
# (N,Class) > N*Class
|
||||
softmax = tf.reshape(tf.nn.softmax(logits), [-1]) # [batch_size * n_class]
|
||||
else:
|
||||
softmax = tf.reshape(tf.nn.sigmoid(logits), [-1]) # [batch_size * n_class]
|
||||
# (N,) > (N,) ,但是数值变换了,变成了每个label在N*Class中的位置
|
||||
# labels_shift = tf.range(0, logits.shape[0]) * logits.shape[1] + labels
|
||||
labels_shift = tf.range(0, batch_size) * label_num + labels
|
||||
# (N*Class,) > (N,)
|
||||
prob = tf.gather(softmax, labels_shift)
|
||||
# 预防预测概率值为0的情况 ; (N,)
|
||||
prob = tf.clip_by_value(prob, epsilon, 1. - epsilon)
|
||||
# (Class ,1) > (N,)
|
||||
alpha_choice = tf.gather(alpha, labels)
|
||||
# (N,) > (N,)
|
||||
weight = tf.pow(tf.subtract(1., prob), gamma)
|
||||
weight = tf.multiply(alpha_choice, weight)
|
||||
# (N,) > 1
|
||||
loss = -tf.reduce_sum(tf.multiply(weight, tf.log(prob)))
|
||||
return loss
|
||||
|
||||
return focal_loss_fixed
|
||||
|
||||
|
||||
def create_compile(self):
|
||||
"""
|
||||
构建优化器、损失函数和评价函数
|
||||
:return:
|
||||
"""
|
||||
from keras_textclassification.keras_layers.keras_radam import RAdam
|
||||
from keras.optimizers import Adam
|
||||
# self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
# loss=[self.focal_loss(alpha=.25, gamma=2)],
|
||||
# metrics=['accuracy'])
|
||||
|
||||
self.model.compile(optimizer=Adam(lr=self.lr, beta_1=0.9, beta_2=0.999, decay=0.0),
|
||||
loss=[self.focal_loss(alpha=.25, gamma=2)], # self.loss, #
|
||||
# loss_weights=[0.6, 0.5],
|
||||
# loss=[self.focal_loss(gamma=2, alpha=0.25, batch_size=self.batch_size, label_num=self.label, epsilon=1.e-7, multi_dim=False, use_softmax=False)],
|
||||
# loss=[self.focal_loss(gamma=2, alpha=0.75)],
|
||||
metrics=['accuracy']) # Any optimize
|
||||
|
||||
|
||||
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -87,7 +87,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -95,7 +95,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -111,7 +111,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -70,7 +70,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -79,8 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
print("data propress ok!")
|
||||
print(len(y_train))
|
||||
# 训练
|
||||
graph.fit(x_train, y_train, x_val, y_val)
|
||||
# 训练 graph.fit(x_train, y_train, x_val, y_val)
|
||||
print("耗时:" + str(time.time()-time_start))
|
||||
|
||||
|
||||
|
@ -41,7 +41,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -52,7 +52,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -82,7 +82,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -90,7 +90,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -106,7 +106,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -83,7 +83,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -82,7 +82,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -69,7 +69,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -85,13 +85,4 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
|
||||
if __name__=="__main__":
|
||||
train(rate=1)
|
||||
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
|
||||
# 开始时候一直不对, learning_rate=0.01才好起来
|
||||
|
||||
# rate=0.01,random,char下训练结果, random随机embedding对结果影响很大
|
||||
# 11s 737us/step - loss: 1.3494 - acc: 0.6646 - val_loss: 1.9863 - val_acc: 0.5501
|
||||
# Epoch 00003: val_loss improved from 2.09217 to 1.98626, saving model to
|
||||
# Epoch 4/20
|
||||
|
||||
|
@ -80,9 +80,11 @@ class RCNNGraph(graph):
|
||||
conv_pools.append(pooled)
|
||||
# 拼接
|
||||
x = Concatenate()(conv_pools)
|
||||
x = Dropout(self.dropout)(x)
|
||||
x = Flatten()(x)
|
||||
#########################################################################
|
||||
|
||||
x = Dense(units=128, activation="tanh")(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
output = Dense(units=self.label, activation=self.activate_classify)(x)
|
||||
self.model = Model(inputs=self.word_embedding.input, outputs=output)
|
||||
self.model.summary(120)
|
||||
|
@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -79,7 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -9,6 +9,7 @@
|
||||
import pathlib
|
||||
import sys
|
||||
import os
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 地址
|
||||
@ -34,7 +35,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
|
||||
'gpu_memory_fraction': 0.66, #gpu使用率
|
||||
'model': {'label': 17, # 类别数
|
||||
'batch_size': 32, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'batch_size': 16, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'filters': [[10, 7, 5], [6, 4, 3]], # 3层的时候
|
||||
# 'filters': [[10, 7], [5, 3]], # 2层的时候
|
||||
# 'filters': [[5, 3], [4, 2]], #2层的时候
|
||||
@ -71,7 +72,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -87,11 +88,4 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
|
||||
if __name__=="__main__":
|
||||
train(rate=1)
|
||||
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
|
||||
|
||||
# 14251/14251 [==============================] - 40s 3ms/step - loss: 0.8393 - acc: 0.7466 - val_loss: 1.0829 - val_acc: 0.6637
|
||||
# Epoch 00003: val_loss improved from 1.12679 to 1.08295, saving model to
|
||||
# Epoch 4/20
|
||||
|
||||
|
@ -39,7 +39,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -50,7 +50,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -80,7 +80,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -104,7 +104,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -91,10 +91,4 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
|
||||
if __name__=="__main__":
|
||||
train(rate=1)
|
||||
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
|
||||
|
||||
# 14251/14251 [==============================] - 13s 931us/step - loss: 1.0622 - acc: 0.6821 - val_loss: 1.6637 - val_acc: 0.6214
|
||||
# Epoch 00003: val_loss improved from 1.74499 to 1.66371, saving model
|
||||
# Epoch 4/20
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -83,7 +83,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -79,7 +79,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
|
@ -83,6 +83,9 @@ class DeepMojiGraph(graph):
|
||||
x, weights = x
|
||||
|
||||
x = Dropout(self.dropout)(x)
|
||||
x = Dense(128, activation="tanh")(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
|
||||
# x = Flatten()(x)
|
||||
# 最后就是softmax
|
||||
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
|
||||
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -13,18 +13,37 @@ import os
|
||||
project_path = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent)
|
||||
sys.path.append(project_path)
|
||||
# 地址
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, \
|
||||
path_hyper_parameters
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, \
|
||||
path_model_dir, path_hyper_parameters
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
|
||||
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
|
||||
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
|
||||
path_root
|
||||
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
|
||||
# 模型图
|
||||
from keras_textclassification.m10_DeepMoji.graph import DeepMojiGraph as Graph
|
||||
# 计算时间
|
||||
import time
|
||||
|
||||
|
||||
|
||||
|
||||
# fast_text config
|
||||
# 模型目录
|
||||
path_model_dir = path_root + "/data/model/ccks_2020_el_cls_deepmoji/"
|
||||
# 模型保存地址
|
||||
path_model = path_model_dir + 'model_fast_text.h5'
|
||||
# 超参数保存地址
|
||||
path_hyper_parameters = path_model_dir + 'hyper_parameters.json'
|
||||
# embedding微调保存地址
|
||||
path_fineture = path_model_dir + "embedding_trainable.h5"
|
||||
|
||||
if not os.path.exists(path_model_dir):
|
||||
os.mkdir(path_model_dir)
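# Note (added): os.mkdir assumes the parent .../data/model directory already exists;
# os.makedirs(path_model_dir, exist_ok=True) would also create any missing parent directories.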
|
||||
|
||||
def train(hyper_parameters=None, rate=1.0):
|
||||
"""
|
||||
训练函数
|
||||
@ -34,20 +53,20 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
"""
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 50, # 句子最大长度, 固定 推荐20-50
|
||||
'len_max': 1376, # 句子最大长度, 固定 推荐20-50
|
||||
'embed_size': 300, # 字/词向量维度
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
|
||||
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
|
||||
'gpu_memory_fraction': 0.66, # gpu使用率
|
||||
'model': {'label': 17, # 类别数
|
||||
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'gpu_memory_fraction': 0.8, # gpu使用率
|
||||
'model': {'label': 23, # 类别数
|
||||
'batch_size': 16, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'decay_rate': 0.999, # 学习率衰减系数, 乘法
|
||||
'epochs': 20, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'patience': 6, # 早停,2-3就好
|
||||
'lr': 1e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
'l2': 1e-6, # l2正则化
|
||||
'activate_classify': 'softmax', # 最后一个layer, 即分类激活函数
|
||||
@ -58,25 +77,25 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
'path_hyper_parameters': path_hyper_parameters, # 模型(包括embedding),超参数地址,
|
||||
'path_fineture': path_fineture, # 保存embedding trainable地址, 例如字向量、词向量、bert向量等
|
||||
'rnn_type': 'GRU', # type of rnn, select 'LSTM', 'GRU', 'Bidirectional-GRU'
|
||||
'rnn_units': 256, # RNN隐藏层,
|
||||
'rnn_units': 150, # RNN隐藏层,
|
||||
},
|
||||
'embedding': {'layer_indexes': [12], # bert取的层数,
|
||||
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换,https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架,https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
},
|
||||
'data': {'train_data': path_baidu_qa_2019_train, # 训练数据
|
||||
'val_data': path_baidu_qa_2019_valid # 验证数据
|
||||
'data': {'train_data': path_ccks_2020_el_cls_train, # 训练数据
|
||||
'val_data': path_ccks_2020_el_cls_dev # 验证数据
|
||||
},
|
||||
}
|
||||
|
||||
# 删除先前存在的模型\embedding微调模型等
|
||||
delete_file(path_model_dir)
|
||||
# delete_file(path_model_dir)
|
||||
time_start = time.time()
|
||||
# graph初始化
|
||||
graph = Graph(hyper_parameters)
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessSim(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -91,12 +110,4 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
|
||||
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
|
||||
# 参数较多,不适合用bert,会比较慢和OOM
|
||||
|
||||
# 1425/1425 [==============================] - 6s 4ms/step - loss: 1.0751 - acc: 0.6618 - val_loss: 1.6913 - val_acc: 0.5227
|
||||
# Epoch 00007: val_loss improved from 1.71417 to 1.69127, saving model
|
||||
# Epoch 8/20
|
||||
train(rate=1)
|
||||
|
@ -8,7 +8,7 @@
|
||||
from keras import regularizers
|
||||
from keras.layers import Dense
|
||||
from keras.layers import Dropout, Flatten
|
||||
from keras.layers import SpatialDropout1D, GlobalMaxPooling1D, MaxPooling1D
|
||||
from keras.layers import SpatialDropout1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate
|
||||
from keras.models import Model
|
||||
|
||||
from keras_textclassification.keras_layers.attention_self import AttentionSelf
|
||||
@ -34,9 +34,13 @@ class SelfAttentionGraph(graph):
|
||||
x = self.word_embedding.output
|
||||
x = SpatialDropout1D(self.dropout_spatial)(x)
|
||||
x = AttentionSelf(self.word_embedding.embed_size)(x)
|
||||
x = GlobalMaxPooling1D()(x)
|
||||
x_max = GlobalMaxPooling1D()(x)
|
||||
x_avg = GlobalAveragePooling1D()(x)
|
||||
x = Concatenate()([x_max, x_avg])
|
||||
x = Dropout(self.dropout)(x)
|
||||
# x = Flatten()(x)
|
||||
x = Dense(72, activation="tanh")(x)
|
||||
x = Dropout(self.dropout)(x)
|
||||
# 最后就是softmax
|
||||
dense_layer = Dense(self.label, activation=self.activate_classify)(x)
|
||||
output = [dense_layer]
|
||||
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -13,9 +13,14 @@ sys.path.append(project_path)
|
||||
# 地址
|
||||
from keras_textclassification.conf.path_config import path_model, path_fineture, path_model_dir, path_hyper_parameters
|
||||
# 训练验证数据地址
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid
|
||||
from keras_textclassification.conf.path_config import path_baidu_qa_2019_train, path_baidu_qa_2019_valid, \
|
||||
path_ccks_2020_el_dev, path_ccks_2020_el_tet, path_ccks_2020_el_train,\
|
||||
path_ccks_2020_el_cls_dev, path_ccks_2020_el_cls_tet, path_ccks_2020_el_cls_train, \
|
||||
path_root
|
||||
|
||||
# 数据预处理, 删除文件目录下文件
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, delete_file
|
||||
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText, PreprocessSim, delete_file
|
||||
|
||||
# 模型图
|
||||
from keras_textclassification.m11_SelfAttention.graph import SelfAttentionGraph as Graph
|
||||
# 计算时间
|
||||
@ -31,18 +36,18 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
"""
|
||||
if not hyper_parameters:
|
||||
hyper_parameters = {
|
||||
'len_max': 50, # 句子最大长度, 固定 推荐20-50
|
||||
'len_max': 1376, # 句子最大长度, 固定 推荐20-50
|
||||
'embed_size': 300, # 字/词向量维度
|
||||
'vocab_size': 20000, # 这里随便填的,会根据代码里修改
|
||||
'trainable': True, # embedding是静态的还是动态的, 即控制可不可以微调
|
||||
'level_type': 'char', # 级别, 最小单元, 字/词, 填 'char' or 'word'
|
||||
'embedding_type': 'random', # 级别, 嵌入类型, 还可以填'xlnet'、'random'、 'bert'、 'albert' or 'word2vec"
|
||||
'gpu_memory_fraction': 0.66, # gpu使用率
|
||||
'model': {'label': 17, # 类别数
|
||||
'batch_size': 64, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'gpu_memory_fraction': 0.76, # gpu使用率
|
||||
'model': {'label': 23, # 类别数
|
||||
'batch_size': 8, # 批处理尺寸, 感觉原则上越大越好,尤其是样本不均衡的时候, batch_size设置影响比较大
|
||||
'dropout': 0.5, # 随机失活, 概率
|
||||
'decay_step': 100, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.9, # 学习率衰减系数, 乘法
|
||||
'decay_step': 1000, # 学习率衰减step, 每N个step衰减一次
|
||||
'decay_rate': 0.999, # 学习率衰减系数, 乘法
|
||||
'epochs': 20, # 训练最大轮次
|
||||
'patience': 3, # 早停,2-3就好
|
||||
'lr': 1e-3, # 学习率, 对训练会有比较大的影响, 如果准确率一直上不去,可以考虑调这个参数
|
||||
@ -58,8 +63,8 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
'embedding': {'layer_indexes': [12], # bert取的层数,
|
||||
# 'corpus_path': '', # embedding预训练数据地址,不配则会默认取conf里边默认的地址, keras-bert可以加载谷歌版bert,百度版ernie(需转换,https://github.com/ArthurRizar/tensorflow_ernie),哈工大版bert-wwm(tf框架,https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
},
|
||||
'data': {'train_data': path_baidu_qa_2019_train, # 训练数据
|
||||
'val_data': path_baidu_qa_2019_valid # 验证数据
|
||||
'data': {'train_data': path_ccks_2020_el_cls_train, # 训练数据
|
||||
'val_data': path_ccks_2020_el_cls_dev # 验证数据
|
||||
},
|
||||
}
|
||||
|
||||
@ -71,7 +76,7 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
print("graph init ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessSim(path_model_dir)
|
||||
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
|
||||
hyper_parameters['data']['train_data'],
|
||||
ra_ed, rate=rate, shuffle=True)
|
||||
@ -86,13 +91,4 @@ def train(hyper_parameters=None, rate=1.0):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train(rate=1) # sample条件下设为1,否则训练语料可能会很少
|
||||
# 注意: 4G的1050Ti的GPU、win10下batch_size=32,len_max=20, gpu<=0.87, 应该就可以bert-finetune了。
# 全量数据训练一轮(batch_size=32),就能达到80%准确率(验证集), 效果还是不错的
# win10下出现过错误,gpu、len_max、batch_size配小一点就好:Failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
|
||||
# 参数较多,不适合用bert,会比较慢和OOM
|
||||
|
||||
#
|
||||
# 1425/1425 [==============================] - 0s 283us/step - loss: 1.0207 - acc: 0.7396 - val_loss: 1.8706 - val_acc: 0.5000
|
||||
# Epoch 00012: val_loss improved from 1.89859 to 1.87060, saving model to
|
||||
# Epoch 13/20
|
||||
train(rate=1)
|
||||
|
@ -5,8 +5,8 @@
|
||||
# @function :Hierarchical Attention Networks for Document Classification(https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf)
|
||||
|
||||
|
||||
from keras.layers import Dense, Dropout, SpatialDropout1D, Flatten, Input
|
||||
from keras.layers import Bidirectional, LSTM, GRU, TimeDistributed
|
||||
from keras.layers import Dense, Dropout, Flatten, Input
|
||||
from keras.layers import Bidirectional, GRU
|
||||
from keras import regularizers
|
||||
from keras.models import Model
|
||||
import keras.backend as K
|
||||
|
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
print("graph load ok!")
|
||||
ra_ed = graph.word_embedding
|
||||
# 数据预处理
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
y, x = read_and_process(hyper_parameters['data']['val_data'])
|
||||
# 取该数据集的百分之几的语料测试
|
||||
len_rate = int(len(y) * rate)
|
||||
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
|
||||
for x_one in x:
|
||||
count += 1
|
||||
ques_embed = ra_ed.sentence2idx(x_one)
|
||||
if hyper_parameters['embedding_type'] == 'bert': # bert数据处理, token
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert数据处理, token
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
# 输入预测
|
||||
# 加载超参数
|
||||
hyper_parameters = load_json(path_hyper_parameter)
|
||||
pt = PreprocessText()
|
||||
pt = PreprocessText(path_model_dir)
|
||||
# 模式初始化和加载
|
||||
graph = Graph(hyper_parameters)
|
||||
graph.load_model()
|
||||
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = '我要打王者荣耀'
|
||||
# str to token
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
|
||||
ques = input()
|
||||
ques_embed = ra_ed.sentence2idx(ques)
|
||||
print(ques_embed)
|
||||
if hyper_parameters['embedding_type'] == 'bert':
|
||||
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
|
||||
x_val_1 = np.array([ques_embed[0]])
|
||||
x_val_2 = np.array([ques_embed[1]])
|
||||
x_val = [x_val_1, x_val_2]
|
||||
|
@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
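The other half of this fix is that every `PreprocessText` / `PreprocessTextMulti` is now constructed with `path_model_dir`, presumably so the preprocessor reads and writes its label/vocab index files next to the saved model rather than at a fixed package location. A small hedged usage sketch (module paths are assumptions, only the constructor change is taken from the diff):

```python
# hedged sketch, assumed module paths; only PreprocessText(path_model_dir) is from the diff
from keras_textclassification.conf.path_config import path_model_dir
from keras_textclassification.data_preprocess.text_preprocess import PreprocessText

# was: pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# pt.preprocess_label_ques_to_idx(...) is then called exactly as in the hunks above,
# with the graph's word_embedding (ra_ed) passed through unchanged
```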
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -88,7 +88,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# predict from input
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -96,7 +96,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -112,7 +112,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -26,6 +26,8 @@ import time

import numpy as np

# path_hyper_parameters=path_model_dir + "hyper_parameters.json"


def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
# accuracy on the test set
@ -40,7 +42,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -51,7 +53,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +83,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# predict from input
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +107,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -36,7 +36,7 @@ def train(hyper_parameters=None, rate=1.0):
'vocab_size': 20000, # placeholder value, it gets overwritten in the code
'trainable': True, # whether the embedding is static or trainable (dynamic)
'level_type': 'char', # granularity, the smallest unit, char/word; set 'char' or 'word'
'embedding_type': 'random', # embedding type, can also be 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
'embedding_type': 'word2vec', # embedding type, can also be 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
'gpu_memory_fraction': 0.66, # gpu memory usage fraction
'model': {'label': 17, # number of classes
'batch_size': 64, # batch size; in principle the larger the better, especially with imbalanced samples; this setting matters a lot
@ -80,7 +80,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -95,10 +95,4 @@ def train(hyper_parameters=None, rate=1.0):


if __name__ == "__main__":
train(rate=1)  # set to 1 for the sample data, otherwise the training corpus may be very small
# Note: on a 4G 1050Ti GPU under win10, batch_size=32, len_max=20 and gpu<=0.87 should be enough to fine-tune bert.
# One epoch on the full data (batch_size=32) already reaches 80% accuracy on the validation set, which is quite good
# Error seen on win10; just set gpu, len_max and batch_size a bit smaller: failed to allocate 3.56G (3822520832 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
# This model has many parameters and is not a good fit for bert; it gets slow and OOMs

# Very slow; it OOMs as soon as the char/word dimension (embed_size) is set at all large
train(rate=1)
@ -27,6 +27,8 @@ import time
import numpy as np




def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
"""
test-set evaluation and model assessment
@ -46,7 +48,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -57,7 +59,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +93,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +101,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +117,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -72,7 +72,7 @@ def train(hyper_parameters=None, rate=1.0):
print('graph init ok!')
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -46,7 +46,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -57,7 +57,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -91,7 +91,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
"""
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -99,7 +99,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -115,7 +115,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -71,7 +71,7 @@ def train(hyper_parameters=None, rate=1.0):
print('graph init ok!')
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -154,7 +154,7 @@ def train_sim(hyper_parameters=None, rate=1.0):


if __name__=='__main__':
# train(rate=1)
train_sim()
train(rate=1)
# train_sim()



@ -166,3 +166,4 @@ def train(graph='TextCNN', label=17, rate=1.0, hyper_parameters=None, path_train
if __name__ == "__main__":
train(graph='TextCNN', label=17, rate=1, path_train_data=None, path_dev_data=None, hyper_parameters=None)

@ -1,13 +1,12 @@
gensim>=3.7.1
jieba>=0.39
numpy>=1.16.2
pandas>=0.23.4
scikit-learn>=0.19.1
tflearn>=0.3.2
tqdm>=4.31.1
passlib>=1.7.1
gensim==3.7.1
jieba==0.39
numpy==1.16.2
pandas==0.23.4
scikit-learn==0.19.1
tflearn==0.3.2
tqdm==4.31.1
passlib==1.7.1
keras==2.2.4
tensorflow-gpu==1.12.0
keras-bert>=0.80.0
keras-xlnet>=0.16.0
keras-adaptive-softmax>=0.6.0
keras-bert==0.80.0
keras-xlnet==0.16.0
keras-adaptive-softmax==0.6.0
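The requirements move from version floors (>=) to exact pins (==) and now list keras 2.2.4 and tensorflow-gpu 1.12.0 explicitly. A small hedged helper, not part of the repository, to check that the local environment matches those pins:

```python
# hypothetical helper (not repo code): print installed versions of the pinned packages
# so mismatches with requirements.txt are easy to spot
import pkg_resources

PINNED = ['gensim', 'jieba', 'numpy', 'pandas', 'scikit-learn', 'tqdm',
          'keras', 'tensorflow-gpu', 'keras-bert', 'keras-xlnet']

for name in PINNED:
    try:
        print(name, pkg_resources.get_distribution(name).version)
    except pkg_resources.DistributionNotFound:
        print(name, 'NOT INSTALLED')
```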
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -86,7 +86,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# predict from input
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -94,7 +94,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -110,7 +110,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -39,7 +39,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -61,7 +61,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -43,7 +43,7 @@ def train(hyper_parameters=None, rate=1.0):
'lr': 1e-3, # learning rate, 5e-5 for bert and 1e-3 otherwise; it strongly affects training, tune it if accuracy never improves
'l2': 1e-9, # l2 regularization
'activate_classify': 'sigmoid', # 'sigmoid', # activation of the last layer, i.e. the classification activation
'loss': 'categorical_crossentropy', # loss function, may be problematic, can be customized
'loss': 'binary_crossentropy', # loss function, may be problematic, can be customized
'metrics': 'top_k_categorical_accuracy', # 1070 classes is too many, so use top-k for now; for this data k is set to the maximum: 33
# 'metrics': 'categorical_accuracy', # metric used to decide when to save a better model
'is_training': True, # training or testing mode
@ -68,7 +68,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessTextMulti()
pt = PreprocessTextMulti(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
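The multi-label config above switches the loss from categorical to binary cross-entropy while keeping the sigmoid output, so each label is scored independently instead of being normalized across classes. A minimal standalone Keras sketch of that output/loss pairing (illustrative sizes only, not the project's actual graph):

```python
# hedged sketch: sigmoid output + binary_crossentropy for multi-label classification,
# mirroring the 'activate_classify'/'loss' settings in the config hunk above
from keras.layers import Dense, Input
from keras.models import Model

num_labels = 17                      # illustrative, as in the 'label' setting shown earlier
inputs = Input(shape=(128,))         # 128-dim sentence feature, purely illustrative
outputs = Dense(num_labels, activation='sigmoid')(inputs)   # one independent score per label

model = Model(inputs, outputs)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['top_k_categorical_accuracy'])
model.summary()
```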
@ -40,7 +40,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
print("graph load ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
y, x = read_and_process(hyper_parameters['data']['val_data'])
# test on only a given fraction of this dataset
len_rate = int(len(y) * rate)
@ -51,7 +51,7 @@ def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.
for x_one in x:
count += 1
ques_embed = ra_ed.sentence2idx(x_one)
if hyper_parameters['embedding_type'] == 'bert': # bert data processing, token
if hyper_parameters['embedding_type'] in ['bert', 'albert']: # bert data processing, token
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -81,7 +81,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
# predict from input
# load hyper-parameters
hyper_parameters = load_json(path_hyper_parameter)
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
# model initialization and loading
graph = Graph(hyper_parameters)
graph.load_model()
@ -89,7 +89,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = '我要打王者荣耀'
# str to token
ques_embed = ra_ed.sentence2idx(ques)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -105,7 +105,7 @@ def pred_input(path_hyper_parameter=path_hyper_parameters):
ques = input()
ques_embed = ra_ed.sentence2idx(ques)
print(ques_embed)
if hyper_parameters['embedding_type'] == 'bert':
if hyper_parameters['embedding_type'] in ['bert', 'albert']:
x_val_1 = np.array([ques_embed[0]])
x_val_2 = np.array([ques_embed[1]])
x_val = [x_val_1, x_val_2]
@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -67,7 +67,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -75,7 +75,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)
@ -66,7 +66,7 @@ def train(hyper_parameters=None, rate=1.0):
print("graph init ok!")
ra_ed = graph.word_embedding
# data preprocessing
pt = PreprocessText()
pt = PreprocessText(path_model_dir)
x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
hyper_parameters['data']['train_data'],
ra_ed, rate=rate, shuffle=True)