add comments

joerg 2018-07-05 22:11:00 +02:00
parent 03464bc3e2
commit 572086adc3
3 changed files with 83 additions and 83 deletions

View File

@@ -15,6 +15,7 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # gpu not required for inference
import argparse
import yaml
import numpy as np
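Note on the CUDA_VISIBLE_DEVICES line above: hiding the GPUs only works reliably if the variable is set before TensorFlow initializes CUDA, which is why the assignment sits directly under import os and ahead of every other import. A minimal sketch of the same pattern (illustrative, not part of this commit):

import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # must be set before TensorFlow touches CUDA

import tensorflow as tf  # from here on TF only sees the CPU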

View File

@@ -13,10 +13,10 @@
# limitations under the License.
# ==============================================================================
import os
import argparse
import yaml
from tqdm import tqdm

import tensorflow as tf

from adnc.data.loader import DataLoader
from adnc.model.mann import MANN

View File

@@ -21,9 +21,9 @@ import numpy as np
import tensorflow as tf
from tqdm import tqdm

from adnc.analysis import Analyser
from adnc.data import DataLoader
from adnc.model import MANN, Optimizer, Supporter
from adnc.model.utils import EarlyStop

tf.reset_default_graph()
@@ -33,85 +33,71 @@ parser.add_argument('--sess', type=int, default=False, help='session number')
parser.add_argument('--check', type=int, default=False, help='restore checkpoint')
args = parser.parse_args()

session_no = args.sess  # allows to restore a specific session
if not session_no:
    session_no = False
restore_checkpoint = args.check  # allows to restore a specific checkpoint
if not restore_checkpoint:
    restore_checkpoint = False

dataset_name = 'babi_task'  # defines the dataset chosen from the config
model_type = 'mann'  # type of model, currently only 'mann'
experiment_name = 'github_example'  # name of the experiment
project_dir = 'experiments/'  # folder to save experiments
config_file = 'config.yml'  # name of the config file
early_stop = EarlyStop(10)  # initialize early stopping after 10 higher losses in a row
analyse = True  # allows a closer analysis of the training progress, like memory influence
plot_process = True  # plots a function plot after each epoch

sp = Supporter(project_dir, config_file, experiment_name, dataset_name, model_type,
               session_no)  # initializes the supporter class for experiment handling

dl = DataLoader(sp.config(dataset_name))  # initializes the data loader class
valid_loader = dl.get_data_loader('valid')  # gets a valid data iterator
train_loader = dl.get_data_loader('train')  # gets a train data iterator

if analyse:
    ana = Analyser(dataset_name, sp.session_dir, save_fig=plot_process,
                   save_variables=True)  # initializes an analyser class

sp.config(model_type)['input_size'] = dl.x_size    # after the data loader is initialized, the input size
sp.config(model_type)['output_size'] = dl.y_size   # and output size are known and used for the model

model = MANN(sp.config('mann'), analyse)  # initializes the model class
data, target, mask = model.feed  # TF data, target and mask placeholders for training

trainer = Optimizer(sp.config('training'), model.loss,
                    model.trainable_variables)  # initializes a trainer class with the optimizer
optimizer = trainer.optimizer  # the optimizer for training, like a standard TF optimizer op

init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=30)

summary_train_loss = tf.summary.scalar("train_loss", model.loss)
summary_valid_loss = tf.summary.scalar("valid_loss", model.loss)
lstm_scale = tf.summary.scalar("lstm_scale", tf.reduce_mean(model.trainable_variables[2]))
lstm_beta = tf.summary.scalar("lstm_beta", tf.reduce_mean(model.trainable_variables[3]))

sp.pub("vocabulary size: {}".format(dl.vocabulary_size))  # prints the values and logs them to a log file
sp.pub("train set length: {}".format(dl.sample_amount('train')))
sp.pub("train batch amount: {}".format(dl.batch_amount('train')))
sp.pub("valid set length: {}".format(dl.sample_amount('valid')))
sp.pub("valid batch amount: {}".format(dl.batch_amount('valid')))
sp.pub("model parameter amount: {}".format(model.parameter_amount))

conf = tf.ConfigProto()  # TF session config for optimal GPU usage
conf.gpu_options.per_process_gpu_memory_fraction = 0.8
conf.gpu_options.allocator_type = 'BFC'
conf.gpu_options.allow_growth = True
conf.allow_soft_placement = True

with tf.Session(config=conf) as sess:

    # restores a model dump after a crash or to continue training
    if sp.restore and restore_checkpoint:
        saver.restore(sess, os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(restore_checkpoint)))
        epoch_start = restore_checkpoint + 1
        sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
@@ -121,7 +107,7 @@ with tf.Session(config=conf) as sess:
            epoch_start = 0
            sp.pub("start new training")
        else:
            saver.restore(sess, tf.train.latest_checkpoint(sp.session_dir))
            epoch_start = int(tf.train.latest_checkpoint(sp.session_dir).split('_')[-1].split('.')[0]) + 1
            sp.pub("restart training with checkpoint {}".format(epoch_start - 1))
    else:
@@ -131,7 +117,7 @@ with tf.Session(config=conf) as sess:
    writer = tf.summary.FileWriter(os.path.join(sp.session_dir, "summary"), sess.graph)

    for e in range(epoch_start, sp.config('training')['epochs']):  # loop over all training epochs
        train_cost = 0
        train_count = 0
@@ -140,11 +126,12 @@ with tf.Session(config=conf) as sess:
        time_e = time.time()
        time_0 = time.time()

        for step in tqdm(range(int(dl.batch_amount('train')))):  # loop over all training samples
            sample = next(train_loader)  # new training sample from the train iterator
            _, c, summary, lb, ls = sess.run([optimizer, model.loss, summary_train_loss, lstm_beta, lstm_scale],
                                             feed_dict={data: sample['x'], target: sample['y'], mask: sample['m']})
            train_cost += c
            train_count += 1
            writer.add_summary(summary, e * dl.batch_amount('train') + step)
@@ -154,75 +141,87 @@ with tf.Session(config=conf) as sess:
        valid_cost = 0
        valid_count = 0
        for v in range(int(dl.batch_amount('valid'))):  # loop over all validation samples
            vsample = next(valid_loader)
            vcost, vpred, summary = sess.run([model.loss, model.prediction, summary_valid_loss],
                                             feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
            valid_cost += vcost
            valid_count += 1
            writer.add_summary(summary, e * dl.batch_amount('valid') + v)

            tm = np.argmax(vsample['y'], axis=-1)  # calculates the word error rate
            pm = np.argmax(vpred, axis=-1)
            corrects = np.equal(tm, pm)
            all_corrects += np.sum(corrects * vsample['m'])
            all_overall += np.sum(vsample['m'])

        valid_cost = valid_cost / valid_count
        train_cost = train_cost / train_count
        word_error_rate = 1 - (all_corrects / all_overall)

        if not np.isnan(valid_cost):  # checks for NaN
            save_path = saver.save(sess,
                                   os.path.join(sp.session_dir, "model_dump_{}.ckpt".format(e)))  # dumps model weights

            if analyse:  # if analyse is set, log the memory influence and the plot functionality
                controller_inf = []
                memory_inf = []
                all_corrects = 0
                all_overall = 0
                for vstep in range(10):  # makes ten valid inferences to get the gradients for the memory influence calculation
                    vsample = next(valid_loader)
                    analyse_values, prediction, gradients = sess.run(
                        [model.analyse, model.prediction, trainer.gradients],
                        feed_dict={data: vsample['x'], target: vsample['y'], mask: vsample['m']})
                    weights = {v.name: {'var': g[1], 'grad': g[0], 'shape': g[0].shape} for v, g in
                               zip(model.trainable_variables, gradients)}

                    if 'x_word' not in vsample.keys():
                        vsample['x_word'] = np.transpose(np.argmax(vsample['x'], axis=-1), (1, 0))

                    data_sample = [vsample['x'], vsample['y'], vsample['m'], vsample['x_word']]
                    decoded_targets, decoded_predictions = dl.decode_output(vsample, prediction)
                    save_list = [analyse_values, prediction, decoded_predictions, data_sample, weights]
                    co_inf, mu_inf = ana.feed_variables_two(save_list, e, name="states_epoch",
                                                            save_plot=vstep)  # calculates the memory influence
                    controller_inf.append(co_inf)
                    memory_inf.append(mu_inf)
                controller_inf = np.mean(controller_inf)
                memory_inf = np.mean(memory_inf)

                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='wer', simple_value=word_error_rate)]),
                                   e * dl.batch_amount('train') + step)
                writer.add_summary(
                    tf.Summary(value=[tf.Summary.Value(tag='controller_inf', simple_value=controller_inf)]),
                    e * dl.batch_amount('train') + step)
                writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='memory_inf', simple_value=memory_inf)]),
                                   e * dl.batch_amount('train') + step)

                sp.pub(
                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, wer {:4.3f}, controller influence {:4.3f}, "
                    "memory influence {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
                        e, step, train_cost, valid_cost, word_error_rate, controller_inf, memory_inf,
                        time.time() - time_0, sp.time_stamp(), save_path))
                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration", "controller influence",
                            "memory influence", "wer"],
                           [e, step, train_cost, valid_cost, time.time() - time_0, controller_inf, memory_inf,
                            word_error_rate])  # saves the values in a numpy array for later analysis
            else:
                sp.pub(
                    "epoch {:3}, step {:5}, train cost {:4.3f}, valid cost {:4.3f}, duration {:5.1f}sec, time: {}, Model saved in {}".format(
                        e, step, train_cost, valid_cost, time.time() - time_0, sp.time_stamp(), save_path))
                sp.monitor(["epoch", "step", "train cost", "valid cost", "duration"],
                           [e, step, train_cost, valid_cost, time.time() - time_0])
        else:
            sp.pub("ERROR: nan in training")
            sys.exit("NAN")  # end training in case of NaN

        if early_stop(valid_cost):
            sp.pub("EARLYSTOP: valid error increase")
            sys.exit("EARLYSTOP")  # end training when the valid loss keeps increasing (early stopping)