From 572086adc3c7aea4de911c9a0748c72cdf214f73 Mon Sep 17 00:00:00 2001
From: joerg
Date: Thu, 5 Jul 2018 22:11:00 +0200
Subject: [PATCH] add comments

---
 scripts/inference_babi_task.py |   1 +
 scripts/inference_cnn_task.py  |   4 +-
 scripts/start_training.py      | 161 ++++++++++++++++-----------------
 3 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/scripts/inference_babi_task.py b/scripts/inference_babi_task.py
index ab7becc..680bf15 100755
--- a/scripts/inference_babi_task.py
+++ b/scripts/inference_babi_task.py
@@ -15,6 +15,7 @@
 import os
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "" # gpu not required for inference
+
 import argparse
 import yaml
 import numpy as np
diff --git a/scripts/inference_cnn_task.py b/scripts/inference_cnn_task.py
index 176f504..991213e 100644
--- a/scripts/inference_cnn_task.py
+++ b/scripts/inference_cnn_task.py
@@ -13,10 +13,10 @@
 # limitations under the License.
 # ==============================================================================
 import os
-import argparse
+
+import tensorflow as tf
 import yaml
 from tqdm import tqdm
-import tensorflow as tf
 
 from adnc.data.loader import DataLoader
 from adnc.model.mann import MANN
diff --git a/scripts/start_training.py b/scripts/start_training.py
index bc556ee..871e5bc 100755
--- a/scripts/start_training.py
+++ b/scripts/start_training.py
@@ -21,9 +21,9 @@
 import numpy as np
 import tensorflow as tf
 from tqdm import tqdm
-from adnc.model import MANN, Optimizer, Supporter
 from adnc.analysis import Analyser
 from adnc.data import DataLoader
+from adnc.model import MANN, Optimizer, Supporter
 from adnc.model.utils import EarlyStop
 
 tf.reset_default_graph()
@@ -33,85 +33,71 @@
 parser.add_argument('--sess', type=int, default=False, help='session number')
 parser.add_argument('--check', type=int, default=False, help='restore checkpoint')
 args = parser.parse_args()
-session_no = args.sess # allows to restore a specific session
+session_no = args.sess  # allows restoring a specific session
 if not session_no:
     session_no = False
-restore_checkpoint = args.check # allows to restore a specific checkpoint
+restore_checkpoint = args.check  # allows restoring a specific checkpoint
 if not restore_checkpoint:
     restore_checkpoint = False
 
+dataset_name = 'babi_task'  # defines the dataset chosen from config
+model_type = 'mann'  # type of model, currently only 'mann'
+experiment_name = 'github_example'  # name of the experiment
 
-data_set_name = 'babi_task'
-model_type = 'mann'
 
+project_dir = 'experiments/'  # folder to save experiments
+config_file = 'config.yml'  # name of config file
 
-experiment_name = 'github_example'
 
+early_stop = EarlyStop(10)  # initializes early stopping after 10 validation-loss increases in a row
+analyse = True  # allows a closer analysis of the training progress, like memory influence
+plot_process = True  # saves a functionality plot after each epoch
 
-project_dir = 'experiments/'
-config_file = 'config.yml'
-
-early_stop = EarlyStop(10)
-
-
-
-analyse = True
-plot_process = True
-
-
-
-
-sp = Supporter(project_dir, config_file, experiment_name, data_set_name, model_type, session_no)
-
-data_set_config = sp.config(data_set_name)
-
-dl = DataLoader(data_set_config)
-valid_loader = dl.get_data_loader('valid')
-train_loader = dl.get_data_loader('train')
+sp = Supporter(project_dir, config_file, experiment_name, dataset_name, model_type,
+               session_no)  # initializes the supporter class for experiment handling
+dl = DataLoader(sp.config(dataset_name))  # initializes the data loader class
+valid_loader = dl.get_data_loader('valid')  # gets a validation data iterator
+train_loader = dl.get_data_loader('train')  # gets a training data iterator
 
 if analyse:
-    ana = Analyser(data_set_name, sp.session_dir, save_fig=plot_process)
+    ana = Analyser(dataset_name, sp.session_dir, save_fig=plot_process,
+                   save_variables=True)  # initializes an analyzer class
 
+sp.config(model_type)['input_size'] = dl.x_size  # after the data loader is initialized, the input size
+sp.config(model_type)['output_size'] = dl.y_size  # and output size are known and used for the model
+model = MANN(sp.config('mann'), analyse)  # initializes the model class
+data, target, mask = model.feed  # TF data, target and mask placeholders for training
 
-sp.config(model_type)['input_size'] = dl.x_size
-sp.config(model_type)['output_size'] = dl.y_size
-model = MANN(sp.config('mann'), analyse)
-
-data, target, mask = model.feed
-
-trainer = Optimizer(sp.config('training'), model.loss, model.trainable_variables)
-optimizer = trainer.optimizer
+trainer = Optimizer(sp.config('training'), model.loss,
+                    model.trainable_variables)  # initializes a trainer class with the optimizer
+optimizer = trainer.optimizer  # the optimizer op used for training, like a plain TF optimizer
 
 init_op = tf.global_variables_initializer()
 saver = tf.train.Saver(max_to_keep=30)
 
 summary_train_loss = tf.summary.scalar("train_loss", model.loss)
 summary_valid_loss = tf.summary.scalar("valid_loss", model.loss)
-
 lstm_scale = tf.summary.scalar("lstm_scale", tf.reduce_mean(model.trainable_variables[2]))
 lstm_beta = tf.summary.scalar("lstm_beta", tf.reduce_mean(model.trainable_variables[3]))
 
-sp.pub("vocabulary size: {}".format(dl.vocabulary_size))
+sp.pub("vocabulary size: {}".format(dl.vocabulary_size))  # prints values and logs them to a log file
 sp.pub("train set length: {}".format(dl.sample_amount('train')))
 sp.pub("train batch amount: {}".format(dl.batch_amount('train')))
 sp.pub("valid set length: {}".format(dl.sample_amount('valid')))
 sp.pub("valid batch amount: {}".format(dl.batch_amount('valid')))
 sp.pub("model parameter amount: {}".format(model.parameter_amount))
 
-
-conf = tf.ConfigProto()
+conf = tf.ConfigProto()  # TF session config for optimal GPU usage
 conf.gpu_options.per_process_gpu_memory_fraction = 0.8
 conf.gpu_options.allocator_type = 'BFC'
 conf.gpu_options.allow_growth = True
 conf.allow_soft_placement = True
 
 with tf.Session(config=conf) as sess:
-
-    if sp.restore and restore_checkpoint:
+    if sp.restore and restore_checkpoint:  # restores model dumps after a crash or to continue training
         saver.restore(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(restore_checkpoint)))
         epoch_start = restore_checkpoint + 1
         sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
@@ -121,7 +107,7 @@ with tf.Session(config=conf) as sess:
             epoch_start = 0
             sp.pub("start new training")
         else:
-            saver.restore(sess,tf.train.latest_checkpoint(sp.session_dir))
+            saver.restore(sess, tf.train.latest_checkpoint(sp.session_dir))
             epoch_start = int(tf.train.latest_checkpoint(sp.session_dir).split('_')[-1].split('.')[0]) + 1
             sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
     else:
@@ -131,7 +117,7 @@ with tf.Session(config=conf) as sess:
 
     writer = tf.summary.FileWriter(os.path.join(sp.session_dir, "summary"), sess.graph)
 
-    for e in range(epoch_start, sp.config('training')['epochs']):
+    for e in range(epoch_start, sp.config('training')['epochs']):  # loop over all training epochs
 
         train_cost = 0
         train_count = 0
@@ -140,11 +126,12 @@ with tf.Session(config=conf) as sess:
         time_e = time.time()
         time_0 = time.time()
 
-        for step in tqdm(range(int(dl.batch_amount('train')))):
+        for step in tqdm(range(int(dl.batch_amount('train')))):  # loop over all training batches
 
-            sample = next(train_loader)
+            sample = next(train_loader)  # new training batch from the train iterator
 
-            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale],feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
+            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale],
+                                             feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
             train_cost += c
             train_count += 1
             writer.add_summary(summary, e * dl.batch_amount('train') + step)
@@ -154,75 +141,87 @@ with tf.Session(config=conf) as sess:
         valid_cost = 0
         valid_count = 0
 
-        for v in range(int(dl.batch_amount('valid'))):
+        for v in range(int(dl.batch_amount('valid'))):  # loop over all validation batches
 
             vsample = next(valid_loader)
-            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss],feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
+            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss],
+                                             feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
             valid_cost += vcost
             valid_count += 1
-            writer.add_summary(summary, e * dl.batch_amount('valid') + v)
-            tm = np.argmax(vsample['y'], axis=-1)
+            writer.add_summary(summary, e * dl.batch_amount('valid') + v)
+            tm = np.argmax(vsample['y'], axis=-1)  # calculates the word error rate
             pm = np.argmax(vpred, axis=-1)
             corrects = np.equal(tm, pm)
             all_corrects += np.sum(corrects * vsample['m'])
             all_overall += np.sum(vsample['m'])
 
         valid_cost = valid_cost / valid_count
-        train_cost = train_cost /train_count
-        word_error_rate = 1- (all_corrects/all_overall)
+        train_cost = train_cost / train_count
+        word_error_rate = 1 - (all_corrects / all_overall)
 
-        if not np.isnan(valid_cost):
+        if not np.isnan(valid_cost):  # checks for NaN
 
-            save_path = saver.save(sess, os.path.join(sp.session_dir ,"model_dump_{}.ckpt".format(e)))
+            save_path = saver.save(sess,
+                                   os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(e)))  # dumps model weights
 
-            if analyse:
+            if analyse:  # if analysis is enabled, it logs the memory influence and plots the functionality
 
                 controller_inf = []
                 memory_inf = []
                 all_corrects = 0
                 all_overall = 0
-                for vstep in range(10):
+                for vstep in range(
+                        10):  # makes ten validation inferences to get the gradients for the memory influence calculation
 
                     vsample = next(valid_loader)
-                    analyse_values, prediction, gradients = sess.run([model.analyse, model.prediction, trainer.gradients],
-                                                                     feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
-                    weights = {v.name: {'var':g[1], 'grad':g[0], 'shape':g[0].shape } for v, g in zip(model.trainable_variables, gradients)}
+                    analyse_values, prediction, gradients = sess.run(
+                        [model.analyse, model.prediction, trainer.gradients],
+                        feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
+                    weights = {v.name: {'var': g[1], 'grad': g[0], 'shape': g[0].shape} for v, g in
+                               zip(model.trainable_variables, gradients)}
 
                     if 'x_word' not in vsample.keys():
-                        vsample['x_word'] = np.transpose(np.argmax(vsample['x'], axis=-1),(1,0))
-                    data_sample = [vsample['x'], vsample['y'], vsample['m'], vsample['x_word'],]
-
+                        vsample['x_word'] = np.transpose(np.argmax(vsample['x'], axis=-1), (1, 0))
+                    data_sample = [vsample['x'], vsample['y'], vsample['m'], vsample['x_word'], ]
                     decoded_targets, decoded_predictions = dl.decode_output(vsample, prediction)
-                    save_list = [analyse_values, prediction, decoded_predictions, data_sample, weights ]
+                    save_list = [analyse_values, prediction, decoded_predictions, data_sample, weights]
 
-                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch", save_plot=vstep)
+                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch",
+                                                            save_plot=vstep)  # calculates the memory influence
                     controller_inf.append(co_inf)
                     memory_inf.append(mu_inf)
 
-
                 controller_inf = np.mean(controller_inf)
                 memory_inf = np.mean(memory_inf)
 
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]), e * dl.batch_amount('train') + step)
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]), e * dl.batch_amount('train') + step)
-                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]), e * dl.batch_amount('train') + step)
-
-
-                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
-                       "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
-                    e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf, time.time() - time_0, sp.time_stamp(), save_path))
-                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence", "memory influence", "wer"],
-                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf, word_error_rate])
+                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]),
+                                   e * dl.batch_amount('train') + step)
+                writer.add_summary(
+                    tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]),
+                    e * dl.batch_amount('train') + step)
+                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]),
+                                   e * dl.batch_amount('train') + step)
+                sp.pub(
+                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
+                    "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
+                        e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf,
+                        time.time() - time_0, sp.time_stamp(), save_path))
+                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence",
+                            "memory influence", "wer"],
+                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf,
+                            word_error_rate])  # saves the values in a numpy array for later analysis
             else:
-                sp.pub("epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
-                    e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
-                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"], [e, step, train_cost, valid_cost, time.time() - time_0])
+                sp.pub(
+                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
+                        e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
+                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"],
+                           [e, step, train_cost, valid_cost, time.time() - time_0])
         else:
             sp.pub("ERROR: nan in training")
-            sys.exit("NAN")
+            sys.exit("NAN")  # ends training in case of NaN
 
         if early_stop(valid_cost):
             sp.pub("EARLYSTOP: valid error increase")
-            sys.exit("EARLYSTOP")
+            sys.exit("EARLYSTOP")  # ends training when the valid loss keeps increasing (early stopping)