# encoding:utf-8
"""
@author = 'XXY'
@contact = '529379497@qq.com'
@researchField = 'NLP DL ML'
@date = '2017/12/21 10:18'
"""
import os
import json
import time
import logging
import pandas as pd
import numpy as np
import tensorflow as tf
from cnn_char_punc import TextCNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from data_helper import batch_iter
def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path) as handle:
        return json.load(handle)


def _read_feature_rows(path, with_label=False):
    """Parse a tab-separated feature file.

    The first column of every line is a bracketed, comma-separated list; the
    surrounding bracket characters are stripped and the remainder is split on
    ','. When *with_label* is true the second column is collected as the
    label of that row.

    Args:
        path: location of the feature file.
        with_label: whether to read the label from the second column.

    Returns:
        A ``(rows, labels)`` tuple. ``rows`` is a list of token lists (tokens
        are kept as the strings read from the file); ``labels`` is a list of
        label strings, empty when *with_label* is false.

    Raises:
        IndexError: if *with_label* is true and a line has no second column.
    """
    rows, labels = [], []
    with open(path) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no features; skip it instead of
                # producing a bogus one-token row.
                continue
            columns = stripped.split('\t')
            # NOTE(review): tokens are appended as strings; confirm whether
            # they should be converted to int ids before being fed to the model.
            rows.append(columns[0][1:-1].split(','))
            if with_label:
                labels.append(columns[1])
    return rows, labels


def train():
    """Train the char/punctuation TextCNN and checkpoint the best dev model.

    Reads the char, punctuation and fully-connected feature files under
    ``./data``, one-hot encodes the labels, holds out 10% of the rows as a
    dev set, and trains with Adam. Every ``params['evaluate_every']`` steps
    the dev set is scored in batches; whenever the dev accuracy is at least
    as good as the best seen so far a checkpoint is written under
    ``models/cnn_models/trained_model_<timestamp>/checkpoints``.

    Hyper-parameters are read from ``./config/cnn_parameters.json``.

    Returns:
        None. Results are reported through ``print`` and ``logging``.
    """
    # NOTE(review): the two vocabularies are loaded but not referenced by any
    # code visible here; they are presumably consumed by TextCNN constructor
    # arguments that are missing from this view -- confirm.
    word2id = _load_json('./data/word2id.json')
    punc2id = _load_json('./data/punc2id.json')

    print("读取char特征")
    X_char, y = _read_feature_rows('./data/training_char.txt', with_label=True)
    print("读取punc特征")
    X_punc, _ = _read_feature_rows('./data/training_punc.txt')
    print("读取全连接层特征")
    X_df_fc = pd.read_csv('./data/training_fc_feat_norm_200.txt', sep=',')

    # One-hot encode the labels: row i of the identity matrix encodes the
    # i-th label in sorted order.
    labels = sorted(set(y))
    label_dict = dict(zip(labels, np.eye(len(labels), dtype=int)))
    y = np.array([label_dict[label] for label in y])

    X_char = np.array(X_char)
    X_punc = np.array(X_punc)
    X_fc_feat = X_df_fc.values

    params = _load_json('./config/cnn_parameters.json')
    batch_size = params['batch_size']

    print("所有训练数据的大小: {} {}".format(X_char.shape, X_punc.shape))
    (X_train_char, X_dev_char,
     X_train_punc, X_dev_punc,
     X_train_fc_feat, X_dev_fc_feat,
     y_train, y_dev) = train_test_split(
        X_char, X_punc, X_fc_feat, y, random_state=10, test_size=0.1)
    print("训练数据大小: {} {} {}".format(
        X_train_char.shape, X_train_punc.shape, X_train_fc_feat.shape))
    print("验证数据大小: {} {} {}".format(
        X_dev_char.shape, X_dev_punc.shape, X_dev_fc_feat.shape))

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False)
        # Using the session as a context manager installs it as the default
        # session and closes it when training ends.
        with tf.Session(config=session_conf) as sess:
            # NOTE(review): only these four keyword arguments are visible in
            # the reviewed source. Any further arguments that
            # cnn_char_punc.TextCNN requires must be restored here.
            cnn = TextCNN(
                sequence_length_char=X_char.shape[1],
                sequence_length_punc=X_punc.shape[1],
                filter_sizes_char=list(
                    map(int, params['filter_sizes_char'].split(","))),
                filter_sizes_punc=list(
                    map(int, params['filter_sizes_punc'].split(","))),
            )

            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)

            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(
                "models", "cnn_models", "trained_model_" + timestamp))
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                # The saver does not create missing parent directories.
                os.makedirs(checkpoint_dir)

            saver = tf.train.Saver(tf.global_variables())
            # Variables must be initialised before the first sess.run on them.
            sess.run(tf.global_variables_initializer())

            def train_step(input_x_char, input_x_punc, input_x_fc_feat, y_train):
                """Run one optimisation step on a batch and print its metrics."""
                feed_dict = {
                    cnn.input_x_char: input_x_char,
                    cnn.input_x_punc: input_x_punc,
                    cnn.input_x_fc_feat: input_x_fc_feat,
                    cnn.input_y: y_train,
                    cnn.dropout_keep_prob: params['dropout_keep_prob'],
                }
                _, step, loss, acc, _ = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy,
                     cnn.predictions],
                    feed_dict)
                print("After training {} step loss is: {}, accuracy is {}".format(step, loss, acc))

            def test_step(input_x_char, input_x_punc, input_x_fc_feat, y_test):
                """Score one batch without updating weights.

                Returns:
                    ``(num_correct, predictions, loss, accuracy)`` for the batch.
                """
                feed_dict = {
                    cnn.input_x_char: input_x_char,
                    cnn.input_x_punc: input_x_punc,
                    cnn.input_x_fc_feat: input_x_fc_feat,
                    cnn.input_y: y_test,
                    # Dropout is a training-time regulariser; keep every unit
                    # when evaluating so dev metrics are deterministic.
                    cnn.dropout_keep_prob: 1.0,
                }
                _, loss, acc, num_correct, prediction = sess.run(
                    [global_step, cnn.loss, cnn.accuracy, cnn.num_correct,
                     cnn.predictions],
                    feed_dict)
                return num_correct, prediction, loss, acc

            # NOTE(review): an original comment here mentioned saving the
            # word-to-id map for predict.py, but no such code is visible.

            # list(...) keeps these sized and sliceable on Python 3 as well,
            # where zip() returns a one-shot iterator.
            train_batches = batch_iter(
                list(zip(X_train_char, X_train_punc, X_train_fc_feat, y_train)),
                batch_size, params['num_epochs'])
            X_dev = list(zip(X_dev_char, X_dev_punc, X_dev_fc_feat, y_dev))

            best_accuracy, best_at_step = 0, 0
            for train_batch in train_batches:
                (X_train_char_batch, X_train_punc_batch,
                 X_train_fc_feat_batch, y_train_batch) = zip(*train_batch)
                train_step(X_train_char_batch, X_train_punc_batch,
                           X_train_fc_feat_batch, y_train_batch)
                current_step = tf.train.global_step(sess, global_step)

                # Evaluate on the dev set every `evaluate_every` steps.
                if current_step % params['evaluate_every'] != 0:
                    continue

                total_dev_correct = 0
                batch_predictions = []
                # Stepping by batch_size never yields an empty trailing
                # batch, even when the dev size is a multiple of batch_size.
                for start_index in range(0, len(X_dev), batch_size):
                    X_dev_batch = X_dev[start_index:start_index + batch_size]
                    (X_dev_batch_char, X_dev_batch_punc,
                     X_dev_batch_fc_feat, y_test_batch) = zip(*X_dev_batch)
                    num_dev_correct, dev_prediction, loss, acc = test_step(
                        X_dev_batch_char, X_dev_batch_punc,
                        X_dev_batch_fc_feat, y_test_batch)
                    total_dev_correct += num_dev_correct
                    batch_predictions.append(dev_prediction)

                # Concatenate once so the predictions keep their own dtype
                # and the cost stays linear in the number of batches.
                dev_predictions = np.concatenate(batch_predictions)
                print("最后预测结果: {}".format(dev_predictions))
                print("长度为: {}".format(len(dev_predictions)))

                dev_accuracy = float(total_dev_correct) / len(y_dev)
                # `loss` is the loss of the last dev batch only.
                logging.critical('Loss on dev set is:{}, Accuracy on dev set: {}'.format(loss, dev_accuracy))

                if dev_accuracy >= best_accuracy:
                    best_accuracy, best_at_step = dev_accuracy, current_step
                    path = saver.save(
                        sess, checkpoint_prefix, global_step=current_step)
                    logging.critical('Saved model at {} at step {}'.format(path, best_at_step))

            logging.critical('Best accuracy is {} at step {}'.format(best_accuracy, best_at_step))
if __name__ == '__main__':